# Social 4 

## Goal: Find correlations between how well students do in school and other factors. The data set we will be using is from Portuguese schools in the subject of math.

### Group Members:  Christine Asai, Alder Futon, Ashley Francis, Brooke Schmidt, Izaan Shaikh

### Imports

In [None]:
import pandas as pd
import seaborn as sns
# Configure seaborn in a single call: a separate sns.set(rc=...) call would
# re-apply seaborn's default style and discard style="ticks".
sns.set(style="ticks", rc={'figure.figsize': (15, 10)})
# NOTE(review): `tips` is not referenced anywhere else in the visible notebook;
# confirm nothing needs it before removing this sample-dataset load.
tips = sns.load_dataset("tips")
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import statsmodels.api as sm                          # statsmodels logistic regression
from sklearn.linear_model import LogisticRegression   # sklearn logistic regression
from sklearn import metrics
import seaborn as sn
from sklearn import svm
from sklearn.metrics import accuracy_score, roc_curve, roc_auc_score, confusion_matrix
from sklearn.model_selection import GridSearchCV

### Reading our Original Dataframe 
#### Student Performance Data Set taken from two Portuguese Secondary Education Schools

In [None]:
# Load the student records; the file uses ';' as its field separator.
math_df = pd.read_csv('student-mat.csv', sep=';')
math_df.head()  # preview the first rows

In [None]:
# Age against absences, drawn with translucent markers (alpha 0.2).
plt.scatter(x=math_df['age'], y=math_df['absences'], alpha=0.2)

### Making Function: data_scrubber
####   This function replaces the string values with numerical values. 
    Input: Dataframe. A list of column names to turn from strings into integers.
    Output: Dataframe, where the columns in the 'names' argument are now integers.

In [None]:
def data_scrubber(df, names):
    """
    Replace the values of the given columns with integer codes.

    Each distinct value in a column is numbered 0, 1, 2, ... in order of
    first appearance. Missing values receive pandas' factorize sentinel, -1.

        Inputs: df, a dataframe (modified in place). names, an iterable of
                column names to encode.
        Outputs: The same dataframe object, with the named columns now
                 holding integer codes.
    """
    for name in names:
        # Assign the codes back to the column explicitly. Calling
        # replace(..., inplace=True) on df[name] mutates a selection of the
        # frame, which is not guaranteed to write through to df.
        codes, _ = pd.factorize(df[name])
        df[name] = codes
    return df

### Replacing the Boolean no/yes Values with 0 and 1 Respectively 

In [None]:
# Work on a copy so math_df keeps the original string values.
numerical_df = math_df.copy()
# Map the yes/no answers to 1/0 everywhere in the frame.
numerical_df.replace({'no': 0, 'yes': 1}, inplace=True)
# Remaining string-valued columns get integer codes from data_scrubber.
categorical_columns = [
    'school', 'sex', 'address', 'famsize', 'Pstatus',
    'Mjob', 'Fjob', 'reason', 'guardian',
]
numerical_df = data_scrubber(numerical_df, categorical_columns)

### Making Histograms of All Features

In [None]:
# One histogram per column, laid out on a 20x20 inch figure.
numerical_df.hist(figsize=(20, 20))

### Turning 1-20 Grade Rankings into 0-1 Rankings

In [None]:
# Rescale the final grade by its 20-point maximum.
numerical_df['G3'] = numerical_df['G3'].div(20)

### Performing Logit Model

In [None]:
# Predictor columns for the logit fit; G3 is the response.
feature_columns = [
    'school', 'sex', 'age', 'address', 'famsize', 'Pstatus',
    'Medu', 'Fedu', 'Mjob', 'Fjob', 'reason', 'guardian',
    'traveltime', 'studytime', 'failures', 'schoolsup', 'famsup', 'paid',
    'activities', 'nursery', 'higher', 'internet', 'romantic', 'famrel',
    'freetime', 'goout', 'Dalc', 'Walc', 'health', 'absences',
]
X = numerical_df[feature_columns]
Y = numerical_df['G3']

# Fixed seed so the 75/25 split is reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    X, Y, train_size=0.75, test_size=0.25, random_state=1
)

# add_constant supplies the intercept column for the statsmodels fit.
logit_model = sm.Logit(y_train, sm.add_constant(x_train))
result = logit_model.fit()
print(result.summary())

### Looking at Correlation Between Features
#### Specifically Looking at Each Feature's Correlation with G3

In [None]:
# Pairwise correlation matrix of all columns; show the G3 column of it.
correlation_df = numerical_df.corr()
correlation_df.loc[:, "G3"]

In [None]:
# Enlarge the default figure size before the figure is created.
plt.rcParams.update({'figure.figsize': (25, 20)})
plt.title('Coorelations with G3')
# Heatmap of the full correlation matrix, without per-cell annotations.
sns.heatmap(data=correlation_df, annot=False, cmap='Blues')
plt.show()

### *Failures* had the highest negative correlation with G3
#### Below is a scatter plot to get more insight to this result

In [None]:
# Final grade against number of past failures.
plt.scatter(x=numerical_df["failures"], y=numerical_df["G3"])

### *Mother Education* had the highest positive correlation with G3
#### Below is a scatter plot to get more insight to this result

In [None]:
# Final grade against mother's education level.
plt.scatter(x=numerical_df["Medu"], y=numerical_df["G3"])

### Finding Quantiles
#### We thought an easy way to approach this would be to use only 4 labels. To do this, we found the quantiles so we knew where to separate the final grades.

In [None]:
# Print the quartile cut points of the scaled final grade, one at a time.
for q in (0.25, 0.50, 0.75):
    print(numerical_df['G3'].quantile([q]))

#### Saving a copy of the data frame to use it later.

In [None]:
# Independent copy taken before G1/G2 are dropped from numerical_df.
numerical_df2 = numerical_df.copy(deep=True)

### Cleaning up the Data
#### Getting rid of first and second semester grades because we don't need to use them

In [None]:
# Remove the first- and second-period grades, then display the frame.
numerical_df = numerical_df.drop(['G1', 'G2'], axis=1)
numerical_df

In [None]:
def quantile_accuracy(df3, N, random_state=None):
    """
    Measure how accurately a linear SVM predicts which of N equal-sized
    grade groups a student falls into.

    Rows are sorted by 'G3' and cut by position into N consecutive groups
    labelled 0..N-1, so students with equal grades can land in neighbouring
    groups. The input dataframe is not modified.

        Inputs: df3, an already cleaned dataframe with a 'G3' column; every
                other column is used as a feature. N, a positive integer.
                random_state, optional seed passed to train_test_split for a
                reproducible split (default None keeps the split random).
        Outputs: The accuracy score on the held-out part of the split.
        Raises: ValueError if N is smaller than 1.
    """
    if N < 1:
        raise ValueError("N must be a positive integer")

    ordered = df3.sort_values(by=['G3'])
    size = len(ordered)
    step = size / N

    # Build the labels in a separate array instead of assigning through
    # .iloc on a Series taken from the frame (SettingWithCopy pattern).
    bins = np.zeros(size, dtype=int)
    for i in range(N):
        bins[round(i * step):round((i + 1) * step)] = i
    labels = pd.Series(bins, index=ordered.index, name='G3')

    train_features, test_features, train_labels, test_labels = train_test_split(
        ordered.drop(columns=['G3']), labels, random_state=random_state
    )

    model = svm.SVC(C=10, kernel='linear')
    model.fit(train_features, train_labels)
    predicted = model.predict(test_features)
    # accuracy_score takes the true labels first, then the predictions.
    return accuracy_score(test_labels, predicted)

In [None]:
# Accuracy for 2 through 9 groups, plotted against the number of groups.
group_counts = range(2, 10)
y = [quantile_accuracy(numerical_df, n_groups) for n_groups in group_counts]
plt.plot(group_counts, y)

### Train Test Split

In [None]:
# G3 is on a 0-1 scale at this point, so the integer cast applied to the
# labels in the model cell would turn every grade below 1.0 into label 0.
# Bin the grade into quartile labels (0-3) first, matching the "4 labels"
# plan described in the quantile section.
quartile_labels = pd.qcut(numerical_df['G3'], q=4, labels=False, duplicates='drop')
train_features, test_features, train_labels, test_labels = train_test_split(
    numerical_df.drop(columns=['G3']), quartile_labels
)

### Making Model

In [None]:
# Linear-kernel SVM on the integer-cast labels; report accuracy and the
# confusion matrix on the held-out rows.
model = svm.SVC(kernel='linear', C=10)
model_fit = model.fit(train_features, train_labels.astype(int))
predict = model_fit.predict(test_features)
true_labels = test_labels.astype(int)
print(accuracy_score(true_labels, predict))
print(confusion_matrix(true_labels, predict))

## Trial 2
### The previous attempt was too easy, we're going to try to calculate final percentage instead of just quantile

### Cleaning up the Data
#### Getting rid of first and second semester grades because we don't need to use them

In [None]:
# Remove the first- and second-period grades from the saved copy.
numerical_df2 = numerical_df2.drop(['G1', 'G2'], axis=1)

#### Multiplying by 100 to turn the values into whole numbers

In [None]:
# Convert the 0-1 grade to a whole-number percentage. The float product is
# not always exactly whole (e.g. 0.55 * 100), so round before casting to int
# rather than relying on a truncating cast later.
numerical_df2['G3'] = (numerical_df2['G3'] * 100).round().astype(int)

### Train Test Split

In [None]:
# Features are every column except G3; G3 is the label.
train_features, test_features, train_labels, test_labels = train_test_split(
    numerical_df2.drop('G3', axis=1),
    numerical_df2['G3'],
)

### Making Model
#### This model is obviously not as good as our previous trial, but it still works.  If the predictions were totally random, the accuracy score would be about 5.5%, so our method does have some predictive power.

In [None]:
# Linear-kernel SVM with the percentage grade (cast to int) as the class label.
model = svm.SVC(kernel='linear', C=10)
model_fit = model.fit(train_features, train_labels.astype(int))
predict = model_fit.predict(test_features)
print(accuracy_score(test_labels.astype(int), predict))


## Trial 3
### We want to try and improve on trial 2 while still using final percentages as labels.  In this trial we will only use features with low p values.

### Features with p values under 0.3

In [None]:
# Keep the label plus the four features selected for this trial.
low_p_columns = ['G3', 'failures', 'famsize', 'schoolsup', 'romantic']
new_df = numerical_df2[low_p_columns]

### Train Test Split

In [None]:
# Features are every column of new_df except G3; G3 is the label.
train_features, test_features, train_labels, test_labels = train_test_split(
    new_df.drop('G3', axis=1),
    new_df['G3'],
)

### Making Model
#### The results of this trial were very similar to trial 2

In [None]:
# Same linear-kernel SVM as before, fitted on the reduced feature set.
model = svm.SVC(kernel='linear', C=10)
model_fit = model.fit(train_features, train_labels.astype(int))
predict = model_fit.predict(test_features)
print(accuracy_score(test_labels.astype(int), predict))


#### Another test with features that have a correlation of over 0.1 with G3

In [None]:
# Keep the label plus the features selected by correlation for this test.
correlated_columns = [
    "G3", "sex", "age", "address", "Medu", "Fedu", "Mjob",
    "traveltime", "failures", "paid", "higher", "romantic", "goout",
]
newdf = numerical_df2[correlated_columns]

#### Train, Test, Split

In [None]:
# Features are every column of newdf except G3; G3 is the label.
train_features, test_features, train_labels, test_labels = train_test_split(
    newdf.drop('G3', axis=1),
    newdf['G3'],
)

### Making Model: 

#### The results were similar to trial 2 

In [None]:
# Linear-kernel SVM fitted on the correlation-selected features.
model = svm.SVC(kernel='linear', C=10)
model_fit = model.fit(train_features, train_labels.astype(int))
predict = model_fit.predict(test_features)
print(accuracy_score(test_labels.astype(int), predict))


## Trial 4
### We want to retry trial 3, but with better parameters

### Using param_grid to find the best estimator

In [None]:
# Search over the regularisation strength only. gamma is a kernel coefficient
# for the rbf/poly/sigmoid kernels, so including it with kernel='linear'
# multiplied the number of fits without changing any score.
param_grid = {'C': [1e3, 5e3, 1e4, 5e4, 1e5]}

# Make a classifier by searching over a classifier and the parameter grid.
clf = GridSearchCV(svm.SVC(kernel='linear', class_weight='balanced'), param_grid)

# NOTE(review): this cell runs before the Trial 4 train/test split, so it
# searches on whatever train_features/train_labels the previous cell left.
clf = clf.fit(train_features, train_labels.astype(int))
print("Best estimator found by grid search:")
print(clf.best_estimator_)



### Train Test Split

In [None]:
# Back to the full feature set: every column of numerical_df2 except G3.
train_features, test_features, train_labels, test_labels = train_test_split(
    numerical_df2.drop('G3', axis=1),
    numerical_df2['G3'],
)

### Making Model
#### The results were still similar to trials 2 & 3

In [None]:
# Class-balanced linear SVM with C=1000.
# NOTE(review): gamma is passed together with kernel='linear'; confirm it has
# any effect on this model.
model = svm.SVC(
    kernel='linear',
    C=1000,
    gamma=0.0001,
    class_weight='balanced',
)
model_fit = model.fit(train_features, train_labels.astype(int))
predict = model_fit.predict(test_features)
print(accuracy_score(test_labels.astype(int), predict))