## Predicting house sale prices with the Kaggle "House Prices - Advanced Regression Techniques" dataset. We will perform EDA, then split the data and apply several machine learning models (Linear Regression, Decision Tree, Random Forest, XGBoost, SVM) to determine which model fits the data best.


In [1]:
# Import libraries
import pandas as pd
import numpy as np

### 1. Read data description and summary



In [2]:
# Read the dataset's field-by-field description file and print it in full.
description_path = "/kaggle/input/house-prices-advanced-regression-techniques/data_description.txt"
with open(description_path, "r") as description_file:
    content = description_file.read()
print(content)

MSSubClass: Identifies the type of dwelling involved in the sale.	

        20	1-STORY 1946 & NEWER ALL STYLES
        30	1-STORY 1945 & OLDER
        40	1-STORY W/FINISHED ATTIC ALL AGES
        45	1-1/2 STORY - UNFINISHED ALL AGES
        50	1-1/2 STORY FINISHED ALL AGES
        60	2-STORY 1946 & NEWER
        70	2-STORY 1945 & OLDER
        75	2-1/2 STORY ALL AGES
        80	SPLIT OR MULTI-LEVEL
        85	SPLIT FOYER
        90	DUPLEX - ALL STYLES AND AGES
       120	1-STORY PUD (Planned Unit Development) - 1946 & NEWER
       150	1-1/2 STORY PUD - ALL AGES
       160	2-STORY PUD - 1946 & NEWER
       180	PUD - MULTILEVEL - INCL SPLIT LEV/FOYER
       190	2 FAMILY CONVERSION - ALL STYLES AND AGES

MSZoning: Identifies the general zoning classification of the sale.
		
       A	Agriculture
       C	Commercial
       FV	Floating Village Residential
       I	Industrial
       RH	Residential High Density
       RL	Residential Low Density
       RP	Residential Low Density Park 
       RM

### 2. Load the dataset and explore the data



In [3]:
# Load the training and test splits from the Kaggle input directory.
train_path = "/kaggle/input/house-prices-advanced-regression-techniques/train.csv"
test_path = "/kaggle/input/house-prices-advanced-regression-techniques/test.csv"
train_df = pd.read_csv(train_path)
test_df = pd.read_csv(test_path)

In [4]:
# Report (rows, columns) for each split.
for split_label, split_frame in (("Train", train_df), ("Test", test_df)):
    print(f"{split_label} shape:", split_frame.shape)

Train shape: (1460, 81)
Test shape: (1459, 80)


In [5]:
# Preview the first five training rows (rendered as the cell's output).
train_df.head(n=5)

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [6]:
import pandas as pd
import numpy as np

def _numeric_column_conclusion(col, stats_row, total_rows):
    """
    Build the quick-conclusion text for one numerical column.

    Args:
        col: Column label, used only in the heading line.
        stats_row: Mapping/Series holding the describe() statistics for the
            column ('count', 'mean', 'std', 'min', '25%', '50%', '75%', 'max').
        total_rows (int): Number of rows in the original DataFrame; the
            difference to 'count' is reported as missing values.

    Returns:
        str: Multi-line summary (missing data, skew, outliers, variability).
    """
    count = stats_row['count']
    mean = stats_row['mean']
    std = stats_row['std']
    min_val = stats_row['min']
    median = stats_row['50%']
    q1 = stats_row['25%']
    q3 = stats_row['75%']
    max_val = stats_row['max']

    conclusion = f"**Column '{col}':**\n"

    # 1. Check for Missing Values
    # describe() reports 'count' as a float; cast so the message reads
    # "259 values" rather than "259.0 values".
    missing_count = int(total_rows - count)
    if missing_count > 0:
        missing_percent = (missing_count / total_rows) * 100
        conclusion += f"  - ⚠️ **Missing Data:** {missing_count} values ({missing_percent:.2f}%). Needs imputation/handling.\n"

    # With no observed values every statistic is NaN, so the remaining
    # heuristics would only produce meaningless "nan" lines.
    if count == 0:
        conclusion += "  - **No Data:** Column has no non-missing values; distribution checks skipped.\n"
        return conclusion

    # 2. Check Distribution (Skewness)
    # The thresholds sit 10% of |median| above/below the median. Picking the
    # factor by sign keeps the result identical to `median * 1.1` /
    # `median * 0.9` for non-negative medians while staying correct for
    # negative ones (where multiplying by 1.1 would move the threshold the
    # wrong way).
    if median >= 0:
        right_threshold, left_threshold = median * 1.1, median * 0.9
    else:
        right_threshold, left_threshold = median * 0.9, median * 1.1

    if mean > right_threshold:  # Significant positive skew
        conclusion += f"  - ➡️ **Right Skew:** Mean ({mean}) > Median ({median}). Likely due to **high outliers**.\n"
    elif mean < left_threshold:  # Significant negative skew
        conclusion += f"  - ⬅️ **Left Skew:** Mean ({mean}) < Median ({median}). Likely due to **low outliers**.\n"
    else:
        conclusion += f"  - ⚖️ **Near Symmetric:** Mean ({mean}) ≈ Median ({median}). Distribution is relatively balanced.\n"

    # 3. Check for Outliers (IQR fences)
    iqr = q3 - q1
    upper_bound_iqr = q3 + 1.5 * iqr
    lower_bound_iqr = q1 - 1.5 * iqr
    # A "strong" high outlier must exceed the upper fence by a further 50% of
    # its magnitude. For a non-negative fence this is exactly `fence * 1.5`;
    # for a negative fence the factor is 0.5 so the threshold still lies
    # above the fence instead of below it.
    strong_upper_bound = upper_bound_iqr * (1.5 if upper_bound_iqr >= 0 else 0.5)

    outlier_notes = []
    if max_val > strong_upper_bound:
        outlier_notes.append(f"Max ({max_val}) is significantly beyond the expected upper bound. Strong **High Outlier** detected.")
    # A minimum of exactly 0 is never reported as a low outlier.
    if min_val < lower_bound_iqr and min_val != 0:
        outlier_notes.append(f"Min ({min_val}) is significantly below the expected lower bound. Possible **Low Outlier** or **invalid data**.")

    if outlier_notes:
        conclusion += f"  - ⚡️ **Outlier Check:** {' '.join(outlier_notes)}\n"

    # 4. Check Variability (std relative to the magnitude of the mean)
    # abs(mean) keeps the ratio meaningful for negative means; a zero mean
    # with a non-zero range is treated as high variability without dividing
    # by zero.
    range_val = max_val - min_val
    if range_val > 0 and (mean == 0 or std / abs(mean) > 0.5):
        conclusion += f"  - 📈 **High Variability:** Standard Deviation ({std}) is high relative to the mean. Data is widely dispersed.\n"
    else:
        conclusion += f"  - 📉 **Moderate Variability:** Data is relatively stable (Std: {std}).\n"

    return conclusion


def eda_descriptive_stats(df):
    """
    Performs basic Exploratory Data Analysis (EDA) and prints quick conclusions
    for every numerical column, based on df.describe().

    The function prints the shape, df.info(), the rounded statistics table and
    one heuristic summary per numerical column (missing data, skew, outliers,
    variability). All heuristics are evaluated on the statistics after they
    have been rounded to 2 decimals.

    Args:
        df (pd.DataFrame): The input dataset.

    Returns:
        pd.DataFrame or None: The transposed descriptive statistics table
        (df.describe().T, one row per numerical column) rounded to 2 decimals,
        or None when df has no numerical columns.
    """
    # The total number of rows is also the baseline for the missing-value check
    total_rows, total_cols = df.shape

    print("--- DATA OVERVIEW ---")
    print(f"Number of Rows (Entries): {total_rows}")
    print(f"Number of Columns (Features): {total_cols}")

    # Print df.info() first for data types and memory usage
    print("\nData Types of Columns:")
    df.info()
    print("-------------------------")

    # Select only numerical columns
    df_numerical = df.select_dtypes(include=np.number)
    if df_numerical.empty:
        print("The dataset contains no numerical columns for statistical analysis.")
        return None

    # Descriptive statistics, one row per column, rounded for easier reading
    stats_df = df_numerical.describe().T.round(2)

    print("\n--- DESCRIPTIVE STATISTICS ANALYSIS (df.describe().T) ---")
    print(stats_df)
    print("\n=========================================================")
    print("--- QUICK CONCLUSIONS FOR NUMERICAL DATA ---")

    for col, stats_row in stats_df.iterrows():
        print(_numeric_column_conclusion(col, stats_row, total_rows))
        print("-" * 30)

    return stats_df

# --- Run the descriptive-statistics report on the training data ---

# 'Id' is a row identifier rather than a measurement: cast it from int64 to
# object so it is not included in the numerical statistics.
train_df['Id'] = train_df['Id'].astype('object')

# Print the report and keep the returned statistics table for later reference.
print("\n>>> ANALYSIS RESULTS FOR DF_TRAIN <<<")
results_df = eda_descriptive_stats(train_df)


>>> ANALYSIS RESULTS FOR DF_TRAIN <<<
--- DATA OVERVIEW ---
Number of Rows (Entries): 1460
Number of Columns (Features): 81

Data Types of Columns:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 81 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             1460 non-null   object 
 1   MSSubClass     1460 non-null   int64  
 2   MSZoning       1460 non-null   object 
 3   LotFrontage    1201 non-null   float64
 4   LotArea        1460 non-null   int64  
 5   Street         1460 non-null   object 
 6   Alley          91 non-null     object 
 7   LotShape       1460 non-null   object 
 8   LandContour    1460 non-null   object 
 9   Utilities      1460 non-null   object 
 10  LotConfig      1460 non-null   object 
 11  LandSlope      1460 non-null   object 
 12  Neighborhood   1460 non-null   object 
 13  Condition1     1460 non-null   object 
 14  Condition2     1460 non-null   obje

In [7]:
# Handle missing data: LotFrontage is a float column, so its gaps are filled
# with the column median.
lot_frontage = train_df['LotFrontage']
median_lot_frontage = lot_frontage.median()
train_df['LotFrontage'] = lot_frontage.fillna(median_lot_frontage)
print(f"value null after resolve: {train_df['LotFrontage'].isna().sum()}")

value null after resolve: 0
