The goal is to find correlations between how well students do in school and other factors. The dataset we will be using comes from Portuguese schools and covers the subject of math.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import statsmodels.api as sm                          # statsmodels logistic regression
from sklearn.linear_model import LogisticRegression   # sklearn logistic regression
from sklearn import metrics
import seaborn as sn
from sklearn import svm
from sklearn.metrics import accuracy_score, roc_curve, roc_auc_score, confusion_matrix
from sklearn.model_selection import GridSearchCV

In [None]:
# Load the student records; the file uses ';' as its field separator.
math_df = pd.read_csv('student-mat.csv', sep=';')
math_df.head()

In [None]:
# Age against number of absences; the low alpha lets overlapping points show up darker.
plt.scatter(x=math_df['age'], y=math_df['absences'], alpha=0.2)

In [None]:
def data_scrubber(df,names):
    """
    Replace the string values in the given columns with integer codes.

    Each listed column is encoded on its own: its distinct values are numbered
    0, 1, 2, ... in order of first appearance in that column.

        Input: Dataframe. A list of column names to turn from strings into integers.
        Output: The same Dataframe object (modified in place and returned), where
                the columns in the 'names' argument are now integers.
    """
    for name in names:
        values = df[name].unique()
        codes = dict(zip(values, range(len(values))))
        # Assign the encoded column back explicitly. Calling
        # replace(..., inplace=True) on df[name] operates on an intermediate
        # object: pandas warns about that pattern, and with copy-on-write the
        # change never reaches df. Mapping through a dict of ints also yields
        # an integer column directly instead of relying on implicit downcasting.
        df[name] = df[name].map(codes)
    return df

In [None]:
# Replace the boolean yes/no values with 1/0. DataFrame.replace returns a new
# frame, so math_df itself keeps the raw strings.
# infer_objects() is called explicitly because the automatic downcast of the
# replaced object columns to integers is deprecated in pandas; without it those
# columns can stay object-typed and be rejected by the numeric steps that follow.
numerical_df = math_df.replace(['no', 'yes'], [0, 1]).infer_objects()
# Encode the remaining categorical string columns as integer codes.
numerical_df = data_scrubber(
    numerical_df,
    ['school', 'sex', 'address', 'famsize', 'Pstatus', 'Mjob', 'Fjob', 'reason', 'guardian'],
)

In [None]:
# Histograms of the encoded columns, drawn together on one large figure.
numerical_df.hist(figsize=(20, 20))

In [None]:
# Scale the final grade G3 down by 20 (presumably grades run 0-20, which gives
# a 0-1 fraction -- TODO confirm). Note this rewrites the column in place.
numerical_df['G3'] = numerical_df['G3'] / 20

# Predictor columns for the model; the period grades G1/G2 and the target G3 are not included.
feature_columns = [
    'school', 'sex', 'age', 'address', 'famsize', 'Pstatus',
    'Medu', 'Fedu', 'Mjob', 'Fjob', 'reason', 'guardian',
    'traveltime', 'studytime', 'failures', 'schoolsup', 'famsup', 'paid',
    'activities', 'nursery', 'higher', 'internet', 'romantic', 'famrel',
    'freetime', 'goout', 'Dalc', 'Walc', 'health', 'absences',
]
X = numerical_df[feature_columns]
Y = numerical_df['G3']

# Fixed 75/25 split so the fitted coefficients are reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    X, Y, train_size=0.75, test_size=0.25, random_state=1
)

# Logit regression with an explicit intercept column; the summary lists the per-feature p-values.
logit_model = sm.Logit(y_train, sm.add_constant(x_train))
result = logit_model.fit()
print(result.summary())

In [None]:
# Pairwise correlation between the columns (Pearson, which is the pandas default).
correlation_df = numerical_df.corr(method='pearson')
correlation_df

In [None]:
# Strongest negative correlation with the final grade: number of past failures.
plt.scatter(x=numerical_df["failures"], y=numerical_df["G3"])

In [None]:
# Strongest positive correlation with the final grade: mother's education.
plt.scatter(x=numerical_df["Medu"], y=numerical_df["G3"])

In [None]:
# Print the quartile cut points (25%, 50%, 75%) of the final grade.
for level in (0.25, 0.50, 0.75):
    print(numerical_df['G3'].quantile([level]))

In [None]:
# Keep a copy in which G3 is still the 0-1 fraction, for the percentage models later on.
numerical_df2 = numerical_df.copy()

# Turn the final-grade fraction into four integer classes to make it easier to classify.
# Only the G3 column is rewritten. Assigning through a bare boolean row mask
# (df[mask] = value) overwrites every column of the matching rows, which would
# copy the class label into all the features and hand the answer to the model.
# All conditions are evaluated on the original fractions and the first match
# wins, so a freshly assigned label is never compared against a later
# threshold; anything above 0.7, including a perfect 1.0, lands in class 4.
g3_fraction = numerical_df['G3']
numerical_df['G3'] = np.select(
    [g3_fraction < 0.4, g3_fraction <= 0.55, g3_fraction <= 0.7],
    [1, 2, 3],
    default=4,
)
# Get rid of the first- and second-period grades because we don't use them.
numerical_df = numerical_df.drop(columns=['G1', 'G2'])


In [None]:
# Train/test split for the quartile classifier (default split proportions).
# random_state is pinned, like the split used for the logit model, so the
# accuracy printed below is reproducible from run to run.
train_features, test_features, train_labels, test_labels = train_test_split(
    numerical_df.drop(columns=['G3']),
    numerical_df['G3'],
    random_state=1,
)

In [None]:
# Linear-kernel SVM on the quartile classes: report accuracy and the confusion matrix.
model = svm.SVC(kernel='linear', C=10)
model_fit = model.fit(train_features, train_labels)
predict = model_fit.predict(test_features)
print(accuracy_score(test_labels, predict))
print(confusion_matrix(test_labels, predict))



In [None]:
# That was too easy; next, try to predict the final percentage grade instead of just the quantile class.

In [None]:
# The period grades G1/G2 are not used as predictors here either.
numerical_df2 = numerical_df2.drop(['G1', 'G2'], axis=1)

In [None]:
# Express the final-grade fraction as a percentage.
numerical_df2['G3'] = numerical_df2['G3'].mul(100)


In [None]:
# Train/test split for the percentage target. random_state is pinned, like the
# split used for the logit model, so the accuracy is reproducible and runs
# with different settings are scored on the same held-out rows.
train_features, test_features, train_labels, test_labels = train_test_split(
    numerical_df2.drop(columns=['G3']),
    numerical_df2['G3'],
    random_state=1,
)

In [None]:
# Same linear SVM, now on the percentage grade; labels are cast to int so that
# each distinct percentage is treated as its own class.
model = svm.SVC(kernel='linear', C=10)
model_fit = model.fit(train_features, train_labels.astype(int))
predict = model_fit.predict(test_features)
print(accuracy_score(test_labels.astype(int), predict))


In [None]:
# Keep the target plus the features whose p-values were under 0.3.
selected_columns = ['G3', 'failures', 'famsize', 'schoolsup', 'romantic']
new_df = numerical_df2[selected_columns]

In [None]:
# Train/test split on the reduced feature set. random_state is pinned, like the
# split used for the logit model, so the accuracy is reproducible.
train_features, test_features, train_labels, test_labels = train_test_split(
    new_df.drop(columns=['G3']),
    new_df['G3'],
    random_state=1,
)

In [None]:
# Linear SVM restricted to the selected features; labels cast to int as before.
model = svm.SVC(kernel='linear', C=10)
model_fit = model.fit(train_features, train_labels.astype(int))
predict = model_fit.predict(test_features)
print(accuracy_score(test_labels.astype(int), predict))


In [None]:
# Regularisation strengths to search over. There is deliberately no 'gamma'
# entry: the estimator below uses kernel='linear', and scikit-learn documents
# gamma as the coefficient of the 'rbf', 'poly' and 'sigmoid' kernels only, so
# searching over it would refit the same linear model once per gamma value
# without changing any score.
param_grid = {'C': [1e3, 5e3, 1e4, 5e4, 1e5]}
# make a classifier by searching over a classifier and the parameter grid
clf = GridSearchCV(svm.SVC(kernel='linear', class_weight='balanced'), param_grid)

# Run the search on the current training split (labels cast to int so they are
# discrete classes) and show the estimator GridSearchCV ranks best.
clf = clf.fit(train_features, train_labels.astype(int))
print("Best estimator found by grid search:")
print(clf.best_estimator_)



In [None]:
# Back to the full feature set for the tuned model. random_state is pinned,
# like the split used for the logit model, so the accuracy is reproducible and
# comparable with the earlier untuned run on the same data.
train_features, test_features, train_labels, test_labels = train_test_split(
    numerical_df2.drop(columns=['G3']),
    numerical_df2['G3'],
    random_state=1,
)

In [None]:
# Linear SVM with the larger C and balanced class weights.
# NOTE(review): gamma is passed through unchanged; per the scikit-learn docs it
# only applies to the rbf/poly/sigmoid kernels, so it should have no effect here.
model = svm.SVC(
    kernel='linear',
    C=1000,
    gamma=0.0001,
    class_weight='balanced',
)
model_fit = model.fit(train_features, train_labels.astype(int))
predict = model_fit.predict(test_features)
print(accuracy_score(test_labels.astype(int), predict))