# DATA WRANGLING

## Step 1

In [None]:
# Importing necessary libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, StandardScaler
# Shared encoder instance; it is refit on each column by the encoding loop further down.
label = LabelEncoder()
# One-hot encoder configured for dense output, ignoring categories unseen at fit time,
# and returning pandas DataFrames from transform().
onehot=OneHotEncoder(sparse_output=False, handle_unknown='ignore').set_output(transform='pandas')
# Scaler used later to standardize the numeric columns.
scaler = StandardScaler()

## Step 2

In [None]:
# Read the semicolon-separated raw file, round every feature column to
# 2 decimal places and re-attach the untouched 'Target' column at the end.
dropouts = pd.read_csv('data.csv', sep=';')
school_rd = dropouts.drop(columns='Target').round(decimals=2)
dropouts_concat = pd.concat([school_rd, dropouts['Target']], axis='columns')
# Work on an independent copy from here on.
school = dropouts_concat.copy()
school.head()


In [None]:
# Print the column names, non-null counts and dtypes of the loaded table
school.info()

In [None]:
# Fix the misspelled 'Nacionality' header and remove the trailing tab
# character from the attendance header.
column_fixes = {
    'Nacionality': 'Nationality',
    'Daytime/evening attendance\t': 'Daytime/evening attendance',
}
school.rename(columns=column_fixes, inplace=True)

In [None]:
# Checking the number of rows and columns; shape is a (rows, columns) tuple
school.shape

In [None]:
# Checking the datatypes
# DataFrame.info() writes its report to stdout itself and returns None, so it is
# called directly; wrapping it in print() would append a stray "None" line.
school.info()

In [None]:
# Summary statistics for the first 18 columns, rounded to 2 decimal places
school.iloc[:,:18].describe().round(2)

In [None]:
# Distinct values of the 1st-semester "without evaluations" column, in ascending order
school['Curricular units 1st sem (without evaluations)'].sort_values().unique()

In [None]:
# Distinct values of the 2nd-semester "without evaluations" column, in ascending order
school['Curricular units 2nd sem (without evaluations)'].sort_values().unique()

In [None]:
# Distinct 'Gender' values, in ascending order
school['Gender'].sort_values().unique()

In [None]:
# Box plot of the 'Curricular units 2nd sem (evaluations)' column
sns.boxplot(school['Curricular units 2nd sem (evaluations)'])
plt.show()

In [None]:
# Recode every 'Application order' value of 9 to 7, then list the values that remain.
# NOTE(review): the reason for folding 9 into 7 is not stated here -- confirm intent.
is_order_nine = school['Application order'] == 9
school.loc[is_order_nine, 'Application order'] = 7
school['Application order'].unique()

In [None]:
# Summary statistics for the columns from position 18 onward, rounded to 2 decimal places
school.iloc[:,18:].describe().round(2)

In [None]:
# Checking for missing values: number of NaN entries in each column
print(school.isna().sum())

In [None]:
# Checking for duplicated rows: count of rows that repeat an earlier row exactly
school.duplicated().sum()

In [None]:
# Distinct 'Marital status' values, in ascending order
school['Marital status'].sort_values().unique()

In [None]:
# Distinct 'Application mode' values, in ascending order
school['Application mode'].sort_values().unique()

In [None]:
# Distinct 'Previous qualification' values, in order of first appearance
school["Previous qualification"].unique()

In [None]:
# Distinct 'Tuition fees up to date' values, in order of first appearance
school['Tuition fees up to date'].unique()

## Step 3

In [None]:
# Label-encode the categorical (and imbalanced) columns. Each source column
# is left in place and its integer codes are stored in a new
# '<column>_encoded' column.
encode_variables = [
    'Marital status',
    'Application mode',
    'Target',
    'Application order',
    'Previous qualification',
    'Nationality',
    "Mother's qualification",
    "Father's qualification",
    "Mother's occupation",
    "Father's occupation",
    'Course',
    'Curricular units 1st sem (credited)',
    'Curricular units 1st sem (enrolled)',
    'Curricular units 1st sem (evaluations)',
    'Curricular units 1st sem (approved)',
    'Curricular units 1st sem (without evaluations)',
    'Curricular units 2nd sem (credited)',
    'Curricular units 2nd sem (enrolled)',
    'Curricular units 2nd sem (evaluations)',
    'Curricular units 2nd sem (approved)',
    'Curricular units 2nd sem (without evaluations)',
]

for name in encode_variables:
    # The shared encoder is refit from scratch on every column.
    codes = label.fit_transform(school[name])
    school[f'{name}_encoded'] = codes

school.describe()

In [None]:
# Summary statistics of the numeric columns currently in `school`
school.describe()

In [None]:
# Columns that hold category codes rather than measurements; they are cast to
# strings so pandas treats them as non-numeric from here on.
category = [
    'Daytime/evening attendance',
    'Displaced',
    'Educational special needs',
    'Debtor',
    'Tuition fees up to date',
    'Gender',
    'Scholarship holder',
    'International',
    'Marital status',
    'Application mode',
    'Target',
    'Application order',
    'Previous qualification',
    'Nationality',
    "Mother's qualification",
    "Father's qualification",
    "Mother's occupation",
    "Father's occupation",
    'Course',
    'Curricular units 1st sem (credited)',
    'Curricular units 1st sem (enrolled)',
    'Curricular units 1st sem (evaluations)',
    'Curricular units 1st sem (approved)',
    'Curricular units 1st sem (without evaluations)',
    'Curricular units 2nd sem (credited)',
    'Curricular units 2nd sem (enrolled)',
    'Curricular units 2nd sem (evaluations)',
    'Curricular units 2nd sem (approved)',
    'Curricular units 2nd sem (without evaluations)',
]
for cat in category:
    school[cat] = school[cat].astype(str)

# Report the resulting dtypes once, after all conversions, instead of
# reprinting the full report on every loop iteration.
school.info()

## Step 4

# Numerical Column Normalizing

In [None]:
# Select the continuous columns and inspect the variance of each one
num = [
    'Age at enrollment',
    'Previous qualification (grade)',
    'Admission grade',
    'Unemployment rate',
    'Inflation rate',
    'GDP',
    'Curricular units 1st sem (grade)',
    'Curricular units 2nd sem (grade)',
]
numeric_values = school.loc[:, num]
numeric_values.var()

In [None]:
# Variance of each numeric column (same output as the previous cell)
numeric_values.var()

In [None]:
# Draw one box plot per numeric column to inspect its distribution
for feature in num:
    values = school[feature]
    plt.boxplot(values)
    plt.title(f'{feature} Distribution')
    plt.show()

In [None]:
# Draw one histogram per numeric column to inspect its distribution
for feature in numeric_values.columns:
    school[feature].hist()
    plt.title(f'{feature} Distribution')
    plt.show()

In [None]:
# Scaling the numeric variables with the shared StandardScaler
normalized_values = scaler.fit_transform(numeric_values)
# Carry over the source column names AND row index so that the later
# column-wise concat with `school` lines the rows up by label even if
# `school` no longer has a default 0..n-1 index.
df = pd.DataFrame(
    normalized_values,
    columns=numeric_values.columns,
    index=numeric_values.index,
)
# Duplicate every scaled column under an '<column>_encoded' name; the
# original-named copies are dropped in the following cell.
for column in list(df.columns):
    df[f'{column}_encoded'] = df[column]
df

In [None]:
# Remove the columns that still carry the original names in a single call,
# leaving only the '<column>_encoded' copies, then report what remains.
df.drop(columns=num, inplace=True)
df.info()

In [None]:
# Draw one histogram per column of the scaled frame
for feature in df.columns:
    df[feature].hist()
    plt.title(f'{feature} Distribution')
    plt.show()

In [None]:
# Variance of each scaled column (should be close to 1 after StandardScaler)
df.var()

In [None]:
# Place the scaled columns to the right of the main table, then summarize
# everything from column position 37 onward.
dropout = pd.concat([school, df], axis='columns')
added_columns = dropout.iloc[:, 37:]
added_columns.describe()

## Step 5

In [None]:
# Write the combined table to 'cleaned_data.csv', omitting the row index
dropout.to_csv('cleaned_data.csv',index=False)