<a href="https://colab.research.google.com/github/AbdoulWadoudou/Projet_NLP/blob/main/Projet.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [5]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import zipfile
import io
import requests

In [6]:
# Location of the zipped dataset on archive.ics.uci.edu.
url = (
    "https://archive.ics.uci.edu/ml/machine-learning-databases/"
    "00320/student.zip"
)


In [7]:
# Download the archive and load one CSV member directly from memory.
# A timeout keeps the cell from hanging forever on a stalled connection.
response = requests.get(url, timeout=30)
# Fail loudly on 4xx/5xx instead of handing an HTML error page to ZipFile,
# which would otherwise surface as a confusing BadZipFile error.
response.raise_for_status()

# The archive lives in an in-memory buffer, so keeping `zip_file` around as a
# notebook-level name holds no OS file handle.
zip_file = zipfile.ZipFile(io.BytesIO(response.content))
# The member file is ';'-separated; close its handle as soon as it is parsed.
with zip_file.open('student-mat.csv') as csv_file:
    df = pd.read_csv(csv_file, sep=';')


In [8]:
# Quick inspection: first rows, per-column dtype / non-null report, summary stats.
print(df.head())
# DataFrame.info() writes its report to stdout itself and returns None, so it
# must not be wrapped in print() (that would emit a stray "None" line).
df.info()
print(df.describe())

  school sex  age address famsize Pstatus  Medu  Fedu     Mjob      Fjob  ...  \
0     GP   F   18       U     GT3       A     4     4  at_home   teacher  ...   
1     GP   F   17       U     GT3       T     1     1  at_home     other  ...   
2     GP   F   15       U     LE3       T     1     1  at_home     other  ...   
3     GP   F   15       U     GT3       T     4     2   health  services  ...   
4     GP   F   16       U     GT3       T     3     3    other     other  ...   

  famrel freetime  goout  Dalc  Walc health absences  G1  G2  G3  
0      4        3      4     1     1      3        6   5   6   6  
1      5        3      3     1     1      3        4   5   5   6  
2      4        3      2     2     3      3       10   7   8  10  
3      3        2      2     1     1      5        2  15  14  15  
4      4        3      2     1     2      5        4   6  10  10  

[5 rows x 33 columns]
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 395 entries, 0 to 394
Data columns (to

In [9]:
# 'G3' is the regression target; every other column is used as a predictor.
y = df['G3']
X = df.drop(columns=['G3'])

In [10]:
# Partition the predictor columns by dtype *family* instead of by exact dtype
# names. Listing only 'int64'/'float64'/'object' would leave out columns stored
# as e.g. int32, category or a dedicated string dtype, and a column missing
# from both groups never reaches the model (ColumnTransformer drops unlisted
# columns by default). include/exclude='number' puts every column in exactly
# one group.
num_features = X.select_dtypes(include='number').columns
cat_features = X.select_dtypes(exclude='number').columns

In [11]:
# Numeric branch of the preprocessing: a single standardisation step.
numeric_transformer = Pipeline([('scaler', StandardScaler())])

In [12]:
# Categorical branch of the preprocessing: one-hot encoding with the first
# level of each feature dropped.
# handle_unknown='ignore' makes transform() encode a category that was absent
# from the fitted data as all zeros instead of raising, so scoring does not
# depend on every level happening to land in the training split.
# NOTE(review): combining drop= with handle_unknown='ignore' needs
# scikit-learn >= 1.0 — confirm the runtime version.
categorical_transformer = Pipeline(steps=[
    ('encoder', OneHotEncoder(drop='first', handle_unknown='ignore'))
])

In [13]:
# Route each column group through its own transformer:
# numeric columns -> numeric_transformer, categorical columns -> categorical_transformer.
preprocessor = ColumnTransformer([
    ('num', numeric_transformer, num_features),
    ('cat', categorical_transformer, cat_features),
])

In [14]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


In [15]:
# Candidate regressors keyed by display name. Insertion order is the order in
# which they are later fitted and reported.
models = dict([
    ('Linear Regression', LinearRegression()),
    ('Decision Tree', DecisionTreeRegressor(random_state=42)),
    ('Random Forest', RandomForestRegressor(random_state=42)),
    ('Gradient Boosting', GradientBoostingRegressor(random_state=42)),
    ('Neural Network', MLPRegressor(random_state=42, max_iter=1000)),
])

In [16]:
def evaluate_model(model, X_test, y_test):
    """Score a fitted estimator on held-out data.

    Parameters
    ----------
    model : object exposing ``predict``
        Called once as ``model.predict(X_test)``.
    X_test : features passed to ``model.predict``.
    y_test : true target values the predictions are compared against.

    Returns
    -------
    tuple
        ``(mse, mae, r2)`` — mean squared error, mean absolute error and
        R² score of the predictions, in that order.
    """
    predictions = model.predict(X_test)
    return (
        mean_squared_error(y_test, predictions),
        mean_absolute_error(y_test, predictions),
        r2_score(y_test, predictions),
    )

In [17]:
# Fit every candidate behind the shared preprocessor and collect its test-set
# metrics under the model's display name.
results = {}

for name, model in models.items():
    # fit() returns the fitted pipeline itself, so construction and fitting chain.
    pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('model', model),
    ]).fit(X_train, y_train)

    mse, mae, r2 = evaluate_model(pipeline, X_test, y_test)
    results[name] = dict(MSE=mse, MAE=mae, R2=r2)



In [18]:
# One row per model, one column per metric (the nested dict is model -> metrics,
# so the frame is transposed after construction).
results_df = pd.DataFrame(results).transpose()
print(results_df)

                        MSE       MAE        R2
Linear Regression  5.656643  1.646666  0.724134
Decision Tree      4.202532  1.139241  0.795049
Random Forest      3.834766  1.179114  0.812984
Gradient Boosting  4.018748  1.159379  0.804012
Neural Network     8.237853  1.998480  0.598252


In [19]:
# Naive reference model: predict the training-set mean of the target for every
# test row, then score it with the same three metrics as the real models.
baseline_pred = np.mean(y_train)
constant_preds = np.full(len(y_test), baseline_pred)

baseline_mse, baseline_mae, baseline_r2 = (
    metric(y_test, constant_preds)
    for metric in (mean_squared_error, mean_absolute_error, r2_score)
)

baseline_results = {
    'Baseline': dict(MSE=baseline_mse, MAE=baseline_mae, R2=baseline_r2),
}

baseline_results_df = pd.DataFrame(baseline_results).transpose()
print(baseline_results_df)

              MAE        MSE       R2
Baseline  3.64585  20.704144 -0.00971


In [20]:
# Stack the model scores on top of the baseline row; concat aligns the metric
# columns by name.
comparison_df = pd.concat((results_df, baseline_results_df), axis=0)
print(comparison_df)

                         MSE       MAE        R2
Linear Regression   5.656643  1.646666  0.724134
Decision Tree       4.202532  1.139241  0.795049
Random Forest       3.834766  1.179114  0.812984
Gradient Boosting   4.018748  1.159379  0.804012
Neural Network      8.237853  1.998480  0.598252
Baseline           20.704144  3.645850 -0.009710
