In [10]:
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns
# A value of 0 switches off matplotlib's "too many open figures" warning.
plt.rcParams['figure.max_open_warning'] = 0

In [11]:
# Load the two semicolon-separated course files:
#   df_mat -> maths course, df_por -> Portuguese course.
# NOTE(review): the two files are read from different directories
# ('student/' vs. the working directory) - confirm this is intentional.
CSV_SEPARATOR = ';'
df_mat = pd.read_csv('student/student-mat.csv', sep=CSV_SEPARATOR)
df_por = pd.read_csv('student-por.csv', sep=CSV_SEPARATOR)

In [12]:
# Inspect both raw datasets: first rows and summary statistics.
display(df_mat.head(),
        df_por.head(),
        df_mat.describe(),
        df_por.describe())

# DataFrame.info() prints its report itself and returns None, so it is called
# directly instead of being passed to display() (which would render "None").
df_mat.info()
df_por.info()

In [13]:
# Stack the maths and Portuguese rows into one dataframe with a fresh 0..n-1 index.
df = pd.concat((df_mat, df_por), axis=0, ignore_index=True)

# Preview the combined frame next to a rounded, transposed summary of every column.
display(
    df.head(),
    df.describe(include="all").round().T,
)

In [14]:
# Number of missing values in each column
df.isnull().sum()

In [15]:
# Cast the categorical columns (including the integer-coded ones such as
# 'Medu' or 'health') to the pandas "category" dtype.

categorical_columns = [
    'school', 'sex', 'address', 'famsize', 'Pstatus', 'Medu', 'Fedu',
    'Mjob', 'Fjob', 'reason', 'guardian', 'traveltime', 'studytime',
    'failures', 'schoolsup', 'famsup', 'paid', 'activities', 'nursery',
    'higher', 'internet', 'romantic', 'famrel', 'freetime', 'goout', 'Dalc',
    'Walc', 'health',
]
df = df.astype({name: 'category' for name in categorical_columns})

In [16]:
# Print the column dtypes and non-null counts of the combined dataframe
# (run after the category cast above, so the new dtypes can be checked).
df.info()

In [17]:
# Display the value counts of every categorical column.
#
# value_counts() counts the column directly; the previous groupby().count()
# aggregated every other column of the frame only to keep a single row.
# sort=False keeps the levels in category order, and the result is transposed
# so that each level is shown as a column of a one-row table.

result = df.select_dtypes(include='category')

for col in result.columns:
    counts = result[col].value_counts(sort=False).rename_axis(col)
    display(counts.to_frame('count').T)
    

# Univariate Analysis

-------------------------------------------------

In [18]:
# Distribution of the categorical variables: one count plot per column.
# Every column that is not int64 is treated as categorical here.

df1 = df.select_dtypes(exclude=['int64'])
for fig_id, name in enumerate(df1):
    plt.figure(fig_id)
    sns.countplot(data=df1, x=name, palette='magma')
    plt.show()

In [19]:
# Distribution of the numerical variables: one 10-bin histogram per int64 column.

df1 = df.select_dtypes(include=['int64'])
for fig_id, name in enumerate(df1):
    plt.figure(fig_id)
    sns.histplot(data=df1, x=name, bins=10)
    plt.show()

# Bivariate Analysis

## Schoolsup with the Grades of the subject
(schoolsup : extra educational support (binary: yes or no))

In [20]:
# Bar height is seaborn's default estimate (the mean) of G3 per category,
# drawn with its default error bar. Title and axis labels are set explicitly
# so the figure can be read on its own.
ax = sns.barplot(data=df, x="schoolsup", y="G3")
ax.set(title="Final grade by extra educational support",
       xlabel="Extra educational support (schoolsup)",
       ylabel="Final grade (G3)");

## Family Support with the grades of the subject
(family educational support (binary: yes or no))

In [21]:
# Bar height is seaborn's default estimate (the mean) of G3 per category,
# drawn with its default error bar. Title and axis labels are set explicitly
# so the figure can be read on its own.
ax = sns.barplot(data=df, x="famsup", y="G3")
ax.set(title="Final grade by family educational support",
       xlabel="Family educational support (famsup)",
       ylabel="Final grade (G3)");

## Nursery class status with the grades of the subject
(attended nursery school (binary: yes or no))

In [22]:
# Bar height is seaborn's default estimate (the mean) of G3 per category,
# drawn with its default error bar. Title and axis labels are set explicitly
# so the figure can be read on its own.
ax = sns.barplot(data=df, x="nursery", y="G3")
ax.set(title="Final grade by nursery school attendance",
       xlabel="Attended nursery school (nursery)",
       ylabel="Final grade (G3)");

## Health with the grades of the subject
(current health status (numeric: from 1 : very bad to 5 : very good))

In [23]:
# Bar height is seaborn's default estimate (the mean) of G3 per health level,
# drawn with its default error bar. Title and axis labels are set explicitly
# so the figure can be read on its own.
ax = sns.barplot(data=df, x="health", y="G3")
ax.set(title="Final grade by current health status",
       xlabel="Health status (1 = very bad ... 5 = very good)",
       ylabel="Final grade (G3)");

## Effect of famsize on students performance
(famsize : family size (binary: "LE3" : less or equal to 3 or "GT3" : greater than 3))

In [24]:
# Distribution of G3 for each family-size group, with explicit title and
# axis labels so the figure can be read on its own.
ax = sns.boxplot(data=df, x="famsize", y="G3")
ax.set(title="Final grade by family size",
       xlabel="Family size (LE3: up to 3, GT3: more than 3)",
       ylabel="Final grade (G3)");

## Effect of Pstatus on students performance
(Pstatus : parent's cohabitation status (binary: "T" : living together or "A" : apart))

In [25]:
# Distribution of G3 for each parental cohabitation status, with explicit
# title and axis labels so the figure can be read on its own.
ax = sns.boxplot(data=df, x="Pstatus", y="G3")
ax.set(title="Final grade by parents' cohabitation status",
       xlabel="Parents' cohabitation status (T: together, A: apart)",
       ylabel="Final grade (G3)");

## Effect of Medu and Fedu on students performance
- Medu : mother's education (numeric: 0 : none, 1 : primary education (4th grade), 2 : 5th to 9th grade, 3 : secondary education or 4: higher education) 
- Fedu : father's education (numeric: 0 : none, 1 : primary education (4th grade), 2 : 5th to 9th grade, 3 : secondary education or 4 : higher education)


In [26]:
# Box plots of the final grade (G3) per parental education level, one panel
# per parent.
#
# `order` pins the plotted levels to 0..4, so the descriptive tick labels
# below always line up with the right box, even if a level is missing from
# the data or the category order changes.
EDUCATION_LEVELS = [0, 1, 2, 3, 4]
EDUCATION_LABELS = ('none', 'primary education (4th grade)', '5th to 9th grade',
                    'secondary education', 'higher education')

# (column, panel title, x-axis label) for each panel
education_panels = (
    ('Medu', 'Mother Education Level', 'mother education'),
    ('Fedu', 'Father Education Level', 'Father education'),
)

fig, ax = plt.subplots(1, 2, figsize=(20, 6), dpi=80, facecolor='w', edgecolor='k')
for axis, (column, title, xlabel) in zip(ax, education_panels):
    sns.boxplot(data=df, x=column, y="G3", order=EDUCATION_LEVELS, ax=axis)
    axis.set_title(title)
    axis.set_xticks(range(len(EDUCATION_LABELS)))
    axis.set_xticklabels(EDUCATION_LABELS, rotation='vertical')
    axis.set_xlabel(xlabel)
    axis.set_ylabel('Final Grades')
fig.suptitle('Effect of Parents Education on students performancee', fontsize=16)
# plt.show() instead of fig.show(): the latter needs a GUI backend and only
# emits a warning under the notebook's inline backend.
plt.show()

## Effect of Mother and Father Job on students performance
 - Mjob : mother's job (nominal: "teacher", "health" care related, civil "services" (e.g. administrative or police), "at_home" or "other")

- Fjob : father's job (nominal: "teacher", "health" care related, civil "services" (e.g. administrative or police), "at_home" or "other")

In [27]:
# Box plots of the final grade (G3) per parental job, one panel per parent.
#
# The job columns are of dtype "category", whose levels are ordered
# alphabetically, so overwriting the tick labels with a hand-written list
# (teacher, health, services, at_home, other) attached wrong names to most
# boxes. The desired order is now passed to seaborn via `order`, which both
# orders the boxes and generates matching tick labels.
JOB_ORDER = ['teacher', 'health', 'services', 'at_home', 'other']

# (column, panel title, x-axis label) for each panel
job_panels = (
    ('Mjob', 'Mother Job Level', 'Mother Job'),
    ('Fjob', 'Father Job Level', 'Father Job'),
)

fig, ax = plt.subplots(1, 2, figsize=(20, 6), dpi=80, facecolor='w', edgecolor='k')
for axis, (column, title, xlabel) in zip(ax, job_panels):
    sns.boxplot(data=df, x=column, y="G3", order=JOB_ORDER, ax=axis)
    axis.set_title(title)
    axis.tick_params(axis='x', labelrotation=90)  # vertical tick labels, text untouched
    axis.set_xlabel(xlabel)
    axis.set_ylabel('Final Grades')
fig.suptitle('Effect of Parents Job on students performancee', fontsize=16)
# plt.show() instead of fig.show(): the latter needs a GUI backend and only
# emits a warning under the notebook's inline backend.
plt.show()

## Interrelationship Between the Different Grades

In [28]:
# Pairwise scatter plots of the three grade columns, one panel per pair.
grade_pairs = (("G1", "G3"), ("G2", "G3"), ("G1", "G2"))

fig, ax = plt.subplots(1, 3, figsize=(20, 6), dpi=80, facecolor='w', edgecolor='k')
for axis, (x_col, y_col) in zip(ax, grade_pairs):
    sns.scatterplot(data=df, x=x_col, y=y_col, ax=axis)
# The former bare sns.cubehelix_palette(...) call was removed: its return
# value was discarded, so it had no effect on the plots.
fig.suptitle('Relationship Between Different Grade with each other', fontsize=16)
# plt.show() instead of fig.show(): the latter needs a GUI backend and only
# emits a warning under the notebook's inline backend.
plt.show()

In [29]:
# Switch seaborn to the "ticks" style; this is a global setting and applies to
# the plots drawn after this cell has run.
sns.set_theme(style="ticks")

In [30]:
# Hexbin joint plots (with marginal distributions) for each pair of grades.
for x_col, y_col in (("G1", "G3"), ("G2", "G3"), ("G1", "G2")):
    sns.jointplot(x=df[x_col], y=df[y_col], kind="hex", color="#4CB391")
# jointplot creates its own figure each time; the previous `fig.show()` pointed
# at the `fig` left over from an earlier cell, not at these plots.
plt.show()

In [31]:
# Hexbin joint plot of G1 against G2.
# NOTE(review): `bins=6` is an extra keyword that seaborn forwards to the
# function drawing the joint axes - presumably matplotlib's hexbin, where it
# controls how the counts are binned for colouring; confirm this is intended.
sns.jointplot(x=df['G1'], y=df['G2'], kind="hex", color="#4CB391",bins=6)

In [32]:
# Show the column names of the combined dataframe.
df.columns

In [33]:
# Bivariate histogram of absences (treated as discrete) against G3, with a
# colour bar shrunk to 75 % of its default size.
sns.histplot(
    data=df,
    x="absences",
    y="G3",
    discrete=(True, False),
    bins=5,
    cbar=True,
    cbar_kws={"shrink": .75},
)

In [34]:
# Step histograms of G3, one per sex; common_norm=False normalises each
# group's density separately so groups of different size can be compared.
sns.histplot(
    data=df,
    x="G3",
    hue="sex",
    bins=6,
    stat="density",
    common_norm=False,
    element="step",
)

In [35]:
# Step histograms of G3, one per value of 'failures'; common_norm=False
# normalises each group's density separately.
sns.histplot(
    data=df,
    x="G3",
    hue="failures",
    bins=6,
    stat="density",
    common_norm=False,
    element="step",
)

In [36]:
# Step histograms of G3, one per value of 'studytime'; common_norm=False
# normalises each group's density separately.
sns.histplot(
    data=df,
    x="G3",
    hue="studytime",
    bins=6,
    stat="density",
    common_norm=False,
    element="step",
)


# Multivariate Analysis

In [37]:
# Bivariate histogram of absences against address, coloured by sex.
sns.histplot(
    data=df,
    x="absences",
    y="address",
    hue="sex",
    bins=10,
    legend=True,
);

In [38]:
# Bivariate histogram of absences against G3, coloured by sex.
sns.histplot(
    data=df,
    x="absences",
    y="G3",
    hue="sex",
    bins=10,
    legend=True,
);

In [39]:
# Bivariate histogram of absences against G3, coloured by address.
sns.histplot(
    data=df,
    x="absences",
    y="G3",
    hue="address",
    bins=10,
    legend=True,
);

### Correlation between the different numerical variables

In [40]:
# Correlation matrix of the numeric columns only. Calling df.corr() on the
# whole frame fails on pandas >= 2.0 because the string-valued categorical
# columns cannot be converted to numbers; select_dtypes works on every pandas
# version. The matrix is computed once and reused below.
corr = df.select_dtypes(include='number').corr()
display(corr)

# plotting correlation heatmap
fig, ax = plt.subplots(figsize=(20, 5))
# Boolean mask that hides the upper triangle (diagonal included), leaving
# each pair of columns shown once.
mask = np.triu(np.ones_like(corr, dtype=bool))
dataplot = sns.heatmap(corr, cmap="YlGnBu", annot=True, mask=mask,
                       annot_kws={'size': 13}, ax=ax)

# displaying heatmap
plt.show()

# Modeling

In [42]:
from sklearn.model_selection import train_test_split
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import r2_score
from sklearn.ensemble import RandomForestRegressor

## Feature Engineering

In [43]:
# Outlier removal with the 1.5 * IQR rule, applied to every numeric column.
#
# A row is kept only if ALL of its numeric values lie inside
# [Q1 - 1.5 * IQR, Q3 + 1.5 * IQR] of their respective column.
#
# The previous loop re-assigned `df_new` from the unfiltered `df` on every
# statement, so only the lower bound of the last numeric column was ever
# applied; the limits of all other columns (and every upper bound) were lost.
# NOTE(review): this now removes more rows than before, and the numeric
# columns include the target G3 - confirm that filtering on the target is
# wanted.

IQR_FACTOR = 1.5

numeric_values = df.select_dtypes(include='number')

percentile25 = numeric_values.quantile(0.25)
percentile75 = numeric_values.quantile(0.75)
iqr = percentile75 - percentile25

lower_limit = percentile25 - IQR_FACTOR * iqr
upper_limit = percentile75 + IQR_FACTOR * iqr

# Compare every column with its own limits (the limit Series are aligned on
# the column labels), then require all columns of a row to pass. Bounds are
# inclusive so that a column with IQR == 0 does not discard every row.
within_limits = (
    numeric_values.ge(lower_limit, axis='columns')
    & numeric_values.le(upper_limit, axis='columns')
).all(axis='columns')

# .copy() gives an independent frame rather than a view-like slice of `df`.
df_new = df[within_limits].copy()

## 1. Linear Regression Model

In [44]:

# Split the data into train and test sets (80 % / 20 %).
# The grade columns and 'absences' are removed from the features; G3 is the target.

features = df_new.drop(columns=['G1', 'G3', 'G2', 'absences'])
target = df_new['G3']

X_train, X_test, y_train, y_test = train_test_split(
    features,
    target,
    test_size=.2,
    random_state=10,
)

In [45]:
# Group the training columns by dtype so each group gets its own preprocessing.

# Columns of dtype "category"
categorical = X_train.select_dtypes('category').columns.tolist()
print(f"Categorical columns are: {categorical}")

# Columns with a numeric dtype
numerical = X_train.select_dtypes('number').columns.tolist()
print(f"Numerical columns are: {numerical}")

In [46]:
#  One-hot encoding of the categorical columns (dense output)

# The keyword requesting dense output was renamed from `sparse` to
# `sparse_output` in scikit-learn 1.2 and the old name was removed in 1.4.
# Try the current name first and fall back to the old one so the notebook
# runs on either side of the rename.
try:
    encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
except TypeError:  # scikit-learn < 1.2 does not know `sparse_output`
    encoder = OneHotEncoder(handle_unknown='ignore', sparse=False)

cat_pipe = Pipeline([
    ('encoder', encoder)
])

In [47]:
#  Scaling of the numerical columns

from sklearn.preprocessing import Normalizer
# Normalizer rescales each ROW (sample) to unit norm rather than scaling each
# feature; with a single numeric column it turns every value into 1.0, which
# wipes the feature out. MinMaxScaler (imported with the other modelling
# tools) scales each column to [0, 1] instead, matching the intent of this
# step.
# NOTE(review): the Normalizer import is kept because a later cell may still
# reference it.
num_pipe = Pipeline([('scaler', MinMaxScaler())])

In [48]:
# Route each column group through its own pipeline and concatenate the results.
column_steps = [
    ('cat', cat_pipe, categorical),
    ('num', num_pipe, numerical),
]
preprocessor = ColumnTransformer(transformers=column_steps)

In [49]:
# Chain the preprocessing and a linear regression model, then fit on the training data.

linear_steps = [
    ('preprocessor', preprocessor),
    ('model', LinearRegression()),
]
pipe = Pipeline(steps=linear_steps)

pipe.fit(X_train, y_train)

In [50]:
# Predictions of the fitted linear pipeline on the training features

y_train_pred = pipe.predict(X_train)
print("Predictions on training data:", y_train_pred)

In [51]:
# Predictions of the fitted linear pipeline on the held-out test features

y_test_pred = pipe.predict(X_test)
print("Predictions on test data:", y_test_pred)

In [52]:
# R^2 of the linear model on the test set

r2 = r2_score(y_true=y_test, y_pred=y_test_pred)
print('r2 score for a  model is', r2)

## 2. Fitting Random Forest

In [53]:
# Work on an independent (deep) copy of the outlier-filtered data
df_rf = df_new.copy()

# Split into train and test sets (80 % / 20 %); besides the grade columns,
# a few additional columns are left out of the features for this model.
dropped_columns = ['G1', 'G3', 'G2', 'address', 'famsize', 'guardian',
                   'traveltime', 'absences', 'romantic']

X_train_n, X_test_n, y_train_n, y_test_n = train_test_split(
    df_rf.drop(columns=dropped_columns),
    df_rf['G3'],
    test_size=.2,
    random_state=10,
)

In [54]:
# Columns of dtype "category" in the random-forest training set

categorical_n = X_train_n.select_dtypes('category').columns.tolist()
print(f"Categorical columns are: {categorical_n}")

In [55]:
# Columns with a numeric dtype in the random-forest training set

numerical_n = X_train_n.select_dtypes('number').columns.tolist()
print(f"Numerical columns are: {numerical_n}")

In [56]:
# One-hot encoding of the categorical columns (dense output)

# The keyword requesting dense output was renamed from `sparse` to
# `sparse_output` in scikit-learn 1.2 and the old name was removed in 1.4.
# Try the current name first and fall back to the old one so the notebook
# runs on either side of the rename.
try:
    encoder_n = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
except TypeError:  # scikit-learn < 1.2 does not know `sparse_output`
    encoder_n = OneHotEncoder(handle_unknown='ignore', sparse=False)

cat_pipe_n = Pipeline([
    ('encoder', encoder_n)
])

In [57]:
# Scaling of the numerical columns

# Normalizer rescales each ROW (sample) to unit norm rather than scaling each
# feature; with a single numeric column it turns every value into 1.0, which
# wipes the feature out. MinMaxScaler (imported with the other modelling
# tools) scales each column to [0, 1] and keeps the ordering of the values.
num_pipe_n = Pipeline([('scaler', MinMaxScaler())])

In [58]:
# Route each column group through its own pipeline and concatenate the results.

column_steps_n = [
    ('cat', cat_pipe_n, categorical_n),
    ('num', num_pipe_n, numerical_n),
]
preprocessor_n = ColumnTransformer(transformers=column_steps_n)

In [59]:
# Chain the preprocessing and a random-forest regressor (depth limited to 10,
# fixed seed), then fit on the training data.

forest_steps = [
    ('preprocessor', preprocessor_n),
    ('model', RandomForestRegressor(max_depth=10, random_state=8)),
]
pipe_n = Pipeline(steps=forest_steps)
pipe_n.fit(X_train_n, y_train_n)

In [60]:
# Predictions of the fitted random-forest pipeline on the training features

y_train_pred_n = pipe_n.predict(X_train_n)
print("Predictions on training data:", y_train_pred_n)

In [61]:
# Predictions of the fitted random-forest pipeline on the held-out test features

y_test_pred_n = pipe_n.predict(X_test_n)
print("Predictions on test data:", y_test_pred_n)

In [62]:
# R^2 of the random-forest model on the test set

r2_n = r2_score(y_true=y_test_n, y_pred=y_test_pred_n)
print('r2_n score for a  model is', r2_n)

### Inference
* The random forest performs better than the linear regression.
* The r2 score is too low to make reliable predictions.
* G1 and G2 are highly correlated with the target (G3); using either one of them as a feature would improve the model.