<a href="https://colab.research.google.com/github/AshMurali77/college-graduation-predictor/blob/main/MIS_373_Project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Import packages and load dataset

In [None]:
import pandas as pd
import seaborn as sns
import numpy as np
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt


# Location of the semicolon-delimited source CSV. The recorded run of this
# cell failed with FileNotFoundError, so the file has to be uploaded to this
# path (or this constant edited) before the cell is executed.
DATA_PATH = '/content/sample_data/data.csv'

# Show every column when a frame is displayed instead of eliding the middle ones.
pd.set_option("display.max_columns", None)

df = pd.read_csv(DATA_PATH, sep=';')

# Preview the first rows only rather than rendering the whole frame.
df.head()

FileNotFoundError: [Errno 2] No such file or directory: '/content/sample_data/data.csv'

### Preprocessing

#### Inspecting Dataset


In [None]:
# Column dtypes and non-null counts. info() prints its report directly and
# returns None, so it does not need to be the last line to be visible.
df.info()

# Number of missing values per column. This is kept as the final expression
# of the cell because a notebook only renders the value of the last
# expression; anywhere else the result is computed and discarded.
df.isnull().sum()

#### Round floats and convert to ints

In [None]:
# Columns that are cast to 64-bit integers once the frame has been rounded.
int_cast_columns = [
    'Admission grade',
    'Previous qualification (grade)',
    'Curricular units 1st sem (grade)',
    'Curricular units 2nd sem (grade)',
    'Unemployment rate',
    'Inflation rate',
    'GDP',
]

# Round every numeric column to zero decimal places first, so the integer
# cast below does not simply truncate the fractional part.
df = df.round()
df[int_cast_columns] = df[int_cast_columns].astype(np.int64)

# Print the dtypes again to confirm the conversion.
df.info()

#### Dropping specific columns

In [None]:
# Columns excluded from the analysis. Each label is listed exactly once
# (the original list repeated both parents' qualification columns).
# 'Nacionality' is spelled the way the column is named in the dataset.
columns_to_drop = [
    'Application mode',
    'Application order',
    'International',
    'Debtor',
    'Marital status',
    'Displaced',
    'Nacionality',
    "Father's qualification",
    "Mother's qualification",
    "Father's occupation",
    "Mother's occupation",
    'Unemployment rate',
    'Inflation rate',
    'GDP',
]

# `columns=` already selects the column axis, so no `axis` argument is needed.
df = df.drop(columns=columns_to_drop)
df.info()

#### Modifying target column so it's numeric

In [None]:
# Binary encoding of the label: 0 = dropped out, 1 = graduated or still enrolled.
TARGET_CODES = {'Dropout': 0, 'Graduate': 1, 'Enrolled': 1}

# map() + an explicit integer cast is used instead of DataFrame.replace so the
# resulting dtype does not depend on pandas silently downcasting an object
# column. Any label missing from TARGET_CODES becomes NaN and makes the cast
# raise, rather than leaving a stray string in the column.
df = df.assign(Target=df['Target'].map(TARGET_CODES).astype('int64'))
df.info()

#### Data Visualization

#### Data Correlation

In [None]:
# Pairwise Pearson correlation between all columns (Pearson is also the
# pandas default); the matrix is shown as the cell output.
correlations = df.corr(method='pearson')
correlations

In [None]:
# Absolute correlation of every feature with the label, excluding the
# label's correlation with itself. The result is stored under its own name
# so that `correlations` keeps holding the full matrix and this cell can be
# re-run without failing.
target_corr = correlations['Target'].drop('Target').abs()

# Ten features most strongly (positively or negatively) correlated with Target.
top_features = target_corr.sort_values(ascending=False).head(10)

# Horizontal bar plot of those ten features.
sns.barplot(x=top_features.values, y=top_features.index, palette='viridis')
plt.title('Top 10 Features by Absolute Correlation with Target')
plt.xlabel('Absolute Correlation')
plt.ylabel('Features')
plt.show()

In [None]:
# White-grid seaborn style for this figure.
sns.set_style('whitegrid')

# Create the figure and its axes explicitly so the size is fixed up front.
fig, ax = plt.subplots(figsize=(12, 8))

# One bar per Target value, coloured by that same value.
sns.countplot(data=df, x='Target', hue='Target', palette='viridis', ax=ax)

# Title and axis labels in larger fonts for readability.
ax.set_title('Count of Target', fontsize=16)
ax.set_xlabel('Dropout Vs. Graduated', fontsize=14)
ax.set_ylabel('Count', fontsize=14)

# Legend in the upper-right corner.
ax.legend(title='Target', title_fontsize='13', fontsize='12', loc='upper right')

plt.show()


#### Data visualization with respect to gender

In [None]:
# Reset the seaborn theme and make 12x8 the default figure size. This is a
# global setting, so it also applies to later figures that do not set their
# own size.
sns.set(rc={'figure.figsize':(12, 8)})

# Count of students per Gender value, split by Target.
ax = sns.countplot(data=df, x='Gender', hue='Target')

# Title, axis labels and legend in the same format as the other count plots.
ax.set_title('Count of Participants by Gender, Grouped by Target', fontsize=16)
ax.set_xlabel('Gender', fontsize=14)
ax.set_ylabel('Count', fontsize=14)
ax.legend(title='Target', title_fontsize='13', fontsize='12', loc='upper right')

# Render the figure explicitly so the cell does not echo the Axes repr.
plt.show()

#### The same count plot with respect to scholarship status

In [None]:
# White-grid seaborn style for this figure.
sns.set_style('whitegrid')

# Create the figure and its axes explicitly so the size is fixed up front.
fig, ax = plt.subplots(figsize=(12, 8))

# Count of students per 'Scholarship holder' value, split by Target.
sns.countplot(data=df, x='Scholarship holder', hue='Target', palette='viridis', ax=ax)

# Title and axis labels in larger fonts for readability.
ax.set_title('Count of Participants by Scholarship holder, Grouped by Target', fontsize=16)
ax.set_xlabel('Scholarship holder', fontsize=14)
ax.set_ylabel('Count', fontsize=14)

# Legend in the upper-right corner.
ax.legend(title='Target', title_fontsize='13', fontsize='12', loc='upper right')

plt.show()


#### Age at enrollment

In [None]:
# White-grid seaborn style for this figure.
sns.set_style('whitegrid')

# Create the figure and its axes explicitly so the size is fixed up front.
fig, ax = plt.subplots(figsize=(12, 8))

# Count of students per enrollment age, split by Target.
sns.countplot(data=df, x='Age at enrollment', hue='Target', palette='viridis', ax=ax)

# Tilt the age tick labels so neighbouring values do not overlap.
plt.xticks(rotation=45)

# Title and axis labels in larger fonts for readability.
ax.set_title('Count of Participants by Age at Enrollment, Grouped by Target', fontsize=16)
ax.set_xlabel('Age at Enrollment', fontsize=14)
ax.set_ylabel('Count', fontsize=14)

# Legend in the upper-right corner.
ax.legend(title='Target', title_fontsize='13', fontsize='12', loc='upper right')

plt.show()


#### We can add more visualizations as we see fit

### Model Building

#### Train/Test Setup

In [None]:
from sklearn.model_selection import train_test_split

# Feature matrix: every column except the label.
x = df.drop(columns='Target')
# Label vector: the binary Target column.
y = df['Target']

# Hold out 30% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.3,
    random_state=2,
)

# Shapes of the four pieces, shown as the cell output.
X_train.shape, X_test.shape, y_train.shape, y_test.shape

#### Logistic Regression training accuracy (note: with scikit-learn's default settings the solver stops at its iteration limit on this data)

In [None]:
from sklearn.linear_model import LogisticRegression

# Fit a logistic regression on the training split. With the default cap of
# 100 iterations the solver stopped before converging on these features, so
# the cap is raised to give it room to converge.
lr = LogisticRegression(max_iter=1000)
lr.fit(X_train, y_train)

# Accuracy on the same rows the model was fitted on.
traininig_pred_lr = lr.predict(X_train)
lr_train_acc = accuracy_score(y_train, traininig_pred_lr)
print('Training accuracy for logistic regression:', lr_train_acc)

#### Logistic Regression testing accuracy

In [None]:
# Score the fitted logistic regression on the held-out test split.
testing_pred_lr = lr.predict(X_test)
lr_test_acc = accuracy_score(y_true=y_test, y_pred=testing_pred_lr)
print('Testing accuracy for logistic regression:', lr_test_acc)

#### Decision Tree training accuracy

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Fit an unpruned decision tree on the training split. The seed (the same
# value used for the train/test split) makes the fitted tree, and therefore
# the reported accuracies, identical from run to run.
dt = DecisionTreeClassifier(random_state=2)
dt.fit(X_train, y_train)

# Accuracy on the same rows the tree was fitted on.
traininig_pred_dt = dt.predict(X_train)
dt_train_acc = accuracy_score(y_train, traininig_pred_dt)
print('Training accuracy for decision tree:', dt_train_acc)

In [None]:
from sklearn import tree

# Plot the fitted decision tree.
# class_names must follow the classes in ascending numeric order. Target was
# encoded as 0 = Dropout and 1 = Graduate/Enrolled, so the dropout label has
# to come first; the reverse order labels every node with the wrong class.
# feature_names is passed as a plain list of column names.
tree.plot_tree(
    dt,
    filled=True,
    feature_names=list(X_train.columns),
    class_names=['Dropped Out', 'Graduated'],
)

# Render the figure explicitly so the cell does not echo the list of
# annotation objects that plot_tree returns.
plt.show()

#### Decision Tree testing accuracy

In [None]:
# Score the fitted decision tree on the held-out test split.
testing_pred_dt = dt.predict(X_test)
dt_test_acc = accuracy_score(y_true=y_test, y_pred=testing_pred_dt)
print('Testing accuracy for decision tree:', dt_test_acc)

#### Random Forest training accuracy

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Fit a random forest on the training split. The forest is built from random
# row and feature samples, so it is seeded (with the same value used for the
# train/test split) to keep the reported accuracies reproducible.
rf = RandomForestClassifier(random_state=2)
rf.fit(X_train, y_train)

# Accuracy on the same rows the forest was fitted on.
traininig_pred_rf = rf.predict(X_train)
rf_train_acc = accuracy_score(y_train, traininig_pred_rf)
print('Training accuracy for random forest:', rf_train_acc)

#### Random Forest testing accuracy

In [None]:
# Score the fitted random forest on the held-out test split.
testing_pred_rf = rf.predict(X_test)
rf_test_acc = accuracy_score(y_true=y_test, y_pred=testing_pred_rf)
print('Testing accuracy for random forest:', rf_test_acc)

### Model Evaluation and Feature Importance

#### Accuracies

In [None]:
# One row per model: (name, training accuracy, testing accuracy).
accuracy_rows = [
    ('Logistic Regression', lr_train_acc, lr_test_acc),
    ('Decision Tree', dt_train_acc, dt_test_acc),
    ('Random Forest', rf_train_acc, rf_test_acc),
]

# Side-by-side comparison table of the three classifiers.
acc_df = pd.DataFrame(
    accuracy_rows,
    columns=['Model Name', 'Training Accuracy', 'Testing Accuracy'],
)