In [3]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import seaborn as sns
from ucimlrepo import fetch_ucirepo

# Step 1: Load the dataset
dataset = fetch_ucirepo(id=320)
data_url = dataset['metadata']['data_url']

# Load the data from the URL
data = pd.read_csv(data_url)

# Extract variable names and data
variables = dataset['variables']
feature_names = variables['name'].tolist()
data.columns = feature_names

# Convert appropriate columns to numeric
for col in data.columns:
    data[col] = pd.to_numeric(data[col], errors='ignore')

# Identify the new target variable
target = 'G3'


  data[col] = pd.to_numeric(data[col], errors='ignore')


In [4]:
# Initial Data Exploration (Basic Information)
print("Basic Information:")
print(data.info())
print("\nFirst few rows of the dataset:")
print(data.head())

print("\nSummary Statistics for Numerical Features:")
print(data.describe())

print("\nSummary Statistics for Categorical Features:")
print(data.describe(include=[object]))

Basic Information:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 649 entries, 0 to 648
Data columns (total 33 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   school      649 non-null    object
 1   sex         649 non-null    object
 2   age         649 non-null    int64 
 3   address     649 non-null    object
 4   famsize     649 non-null    object
 5   Pstatus     649 non-null    object
 6   Medu        649 non-null    int64 
 7   Fedu        649 non-null    int64 
 8   Mjob        649 non-null    object
 9   Fjob        649 non-null    object
 10  reason      649 non-null    object
 11  guardian    649 non-null    object
 12  traveltime  649 non-null    int64 
 13  studytime   649 non-null    int64 
 14  failures    649 non-null    int64 
 15  schoolsup   649 non-null    object
 16  famsup      649 non-null    object
 17  paid        649 non-null    object
 18  activities  649 non-null    object
 19  nursery     649 non-null    obj

In [5]:
# Step 3: Handling Missing Values
# Separate numerical and categorical features
numerical_features = data.select_dtypes(include=[np.number]).columns.tolist()
categorical_features = data.select_dtypes(include=[object]).columns.tolist()

# Impute missing values for numerical features
imputer_num = SimpleImputer(strategy='mean')
data[numerical_features] = imputer_num.fit_transform(data[numerical_features])

# Explanation:
# Missing values can cause errors in data analysis and machine learning models. Imputing with the mean is a common strategy for numerical features to maintain data consistency.

In [6]:
# Step 4: Encoding Categorical Variables
# Convert categorical columns to numerical using one-hot encoding
data = pd.get_dummies(data, drop_first=True)

# Explanation:
# Categorical variables need to be converted to numerical form for machine learning algorithms. One-hot encoding is a common method.

In [9]:
# Step 5: Outlier Removal
# Outlier removal using the IQR method for numerical features only
numeric_data = data.select_dtypes(include=[np.number])
Q1 = numeric_data.quantile(0.25)
Q3 = numeric_data.quantile(0.75)
IQR = Q3 - Q1
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

# Only remove outliers for numerical columns
data = data[~((numeric_data < lower_bound) | (numeric_data > upper_bound)).any(axis=1)]

# Explanation:
# Outliers can skew the results of data analysis and modeling. The IQR method is used to identify and remove outliers from numerical features only, ensuring a more robust analysis.

In [10]:
# Step 6: Normalizing Numerical Features
scaler = StandardScaler()
data[numerical_features] = scaler.fit_transform(data[numerical_features])

# Explanation:
# Normalizing numerical features ensures that all features contribute equally to the analysis and models, preventing features with larger scales from dominating.

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data[numerical_features] = scaler.fit_transform(data[numerical_features])


In [11]:
# Step 7: Splitting the Dataset into Train and Test Sets
X = data.drop(columns=[target])
y = data[target]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Explanation:
# Splitting the data into training and testing sets allows for the evaluation of model performance on unseen data, helping to prevent overfitting.

In [12]:
# Display basic information about the processed data
print("Basic Information after Preprocessing:")
print(data.info())

print("\nFirst few rows of the processed dataset:")
print(data.head())

print("\nSummary Statistics of the processed dataset:")
print(data.describe())

# Save the cleaned dataset for further analysis
data.to_csv('cleaned_student_data2.csv', index=False)
print("Data preprocessing and cleaning complete.")

Basic Information after Preprocessing:
<class 'pandas.core.frame.DataFrame'>
Index: 393 entries, 1 to 648
Data columns (total 42 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   age                393 non-null    float64
 1   Medu               393 non-null    float64
 2   Fedu               393 non-null    float64
 3   traveltime         393 non-null    float64
 4   studytime          393 non-null    float64
 5   failures           393 non-null    float64
 6   famrel             393 non-null    float64
 7   freetime           393 non-null    float64
 8   goout              393 non-null    float64
 9   Dalc               393 non-null    float64
 10  Walc               393 non-null    float64
 11  health             393 non-null    float64
 12  absences           393 non-null    float64
 13  G1                 393 non-null    float64
 14  G2                 393 non-null    float64
 15  G3                 393 non-null    float