In [4]:
# Clone the repository into ./Big-Data so the dataset CSV is available locally.
# This cell has to run before the data-loading cell further down: the recorded
# execution counts show the load ran first and failed with FileNotFoundError.
!git clone https://github.com/FaridRash/HW-SW-B Big-Data

Cloning into 'Big-Data'...
remote: Enumerating objects: 372, done.[K
remote: Counting objects: 100% (156/156), done.[K
remote: Compressing objects: 100% (151/151), done.[K
remote: Total 372 (delta 93), reused 7 (delta 4), pack-reused 216 (from 1)[K
Receiving objects: 100% (372/372), 19.63 MiB | 19.40 MiB/s, done.
Resolving deltas: 100% (188/188), done.


## **The libraries**

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import sklearn
import seaborn as sns
import plotly.figure_factory as ff
from statsmodels.tools.tools import add_constant
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
from scipy.cluster.hierarchy import dendrogram, linkage
from sklearn.cluster import AgglomerativeClustering
from sklearn.cluster import DBSCAN
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix

## **Fetching**

In [3]:
# Load the dataset from the cloned repository into a pandas DataFrame.
# The absolute path presumably assumes the clone cell was run with /content as the
# working directory (e.g. Google Colab) — TODO confirm. If the repository has not
# been cloned yet, read_csv raises FileNotFoundError.
data = pd.read_csv('/content/Big-Data/Data/xAPI-Edu-Data.csv')

FileNotFoundError: [Errno 2] No such file or directory: '/content/Big-Data/Data/xAPI-Edu-Data.csv'

In [None]:
# Display the first 10 rows of the DataFrame to get an overview of the data
data.head(10)

In [None]:
# Print the shape of the DataFrame to know the number of rows and columns
print(data.shape)

In [None]:
# Display a concise summary of the DataFrame, including data types and non-null counts
data.info()

In [None]:
# Generate descriptive statistics for the numerical columns in the DataFrame
data.describe()

## **Data Cleaning**

In [None]:
# Visualise where values are missing: every cell of `data` is coloured by
# whether it is null. The figure gets a title so it can be read on its own,
# and plt.show() keeps the Axes repr out of the cell output.
fig, ax = plt.subplots()
sns.heatmap(data.isnull(), cbar=False, yticklabels=False, ax=ax)
ax.set_title('Missing Values per Column')
plt.show()

In [None]:
# Keep the column names as a plain Python list
columns_list = data.columns.tolist()

# Print a 1-based numbered listing of the columns
for position, column_name in enumerate(columns_list, start=1):
    print(f"{position}. {column_name}")

In [None]:
# Calculate the number of missing values in each column
missing_data = data.isnull().sum()

# Print the missing values for each column
print("Missing values in each column:")
print(missing_data)


In [None]:
# Flag duplicated rows once and derive both summary values from the same mask
duplicate_mask = data.duplicated()
are_duplicates = duplicate_mask.any()
num_duplicates = duplicate_mask.sum()

# Cell output: (any duplicates?, number of duplicate rows)
(are_duplicates, num_duplicates)


In [None]:
# Keep only the first occurrence of every row; the surviving rows keep their
# original index labels
data = data.drop_duplicates()


In [None]:
# Re-run the duplicate check on the de-duplicated frame, reusing a single mask
remaining_duplicate_mask = data.duplicated()
are_duplicates_after_removal = remaining_duplicate_mask.any()
num_duplicates_after_removal = remaining_duplicate_mask.sum()

# Cell output: (any duplicates left?, how many) — confirms the removal
(are_duplicates_after_removal, num_duplicates_after_removal)


## **Encoding**

In [None]:
# Names of all columns stored with the object dtype
object_columns = data.select_dtypes(include=['object']).columns.tolist()

# For every such column print a header, one distinct value per line, then a blank line
for col_name in object_columns:
    print(f"Unique values for column '{col_name}':")
    unique_values = data[col_name].unique()
    for value in unique_values:
        print(value)
    print()


In [None]:
def get_uniques_alternative(df, columns):
    """
    Collect the distinct values of the requested columns.

    :param df: pandas DataFrame
    :param columns: iterable of column names to inspect
    :return: dictionary mapping each column name to the list of its unique values
    """
    return {name: df[name].unique().tolist() for name in columns}


In [None]:
def get_categorical_columns_alternative(df):
    """
    List the columns of ``df`` whose dtype compares equal to 'object'.

    :param df: pandas DataFrame
    :return: list of matching column names, in column order
    """
    return [name for name, dtype in df.dtypes.items() if dtype == 'object']


In [None]:
# Get unique values for all categorical columns in the DataFrame
unique_values_dict = get_uniques_alternative(data, get_categorical_columns_alternative(data))

# Display the dictionary of unique values
unique_values_dict


In [None]:
# Display the column names of the DataFrame
data.columns


In [None]:
# For each demographic column, print its normalized value counts within every
# 'Class' group; the trailing newline arguments separate the tables
for demographic_column in ['gender', 'NationalITy', 'PlaceofBirth', 'StageID']:
    class_shares = data.groupby(['Class'])[demographic_column].value_counts(normalize=True)
    print(class_shares, '\n', '\n', '\n')


In [None]:
# Categorize the features into binary, ordinal, and nominal categories.
# Order matters: binary_features is zipped position-by-position with
# binary_positive_values, and nominal_features with nominal_prefixes, in the
# encoding cells below.
binary_features = ['gender', 'Semester', 'Relation', 'ParentAnsweringSurvey', 'ParentschoolSatisfaction', 'StudentAbsenceDays']
ordinal_features = ['StageID', 'GradeID']
nominal_features = ['NationalITy', 'PlaceofBirth', 'SectionID', 'Topic']

# Specify the target column
target_column = 'Class'


In [None]:
# Define the positive values for binary encoding of binary features.
# Entry i is the value of binary_features[i] that is encoded as 1; every other
# value of that column becomes 0.
binary_positive_values = ['M', 'S', 'Father', 'Yes', 'Good', 'Above-7']


In [None]:
# Define the ordering for the 'StageID' ordinal feature
stage_ordering = ['lowerlevel', 'MiddleSchool', 'HighSchool']

# Define the ordering for the 'GradeID' ordinal feature
grade_ordering = ['G-02', 'G-04', 'G-05', 'G-06', 'G-07', 'G-08', 'G-09', 'G-10', 'G-11', 'G-12']


In [None]:
# Define prefixes for nominal features to be used in encoding.
# Entry i is the column-name prefix for the one-hot columns derived from
# nominal_features[i] (e.g. 'S' for 'SectionID').
nominal_prefixes = ['N', 'B', 'S', 'T']


In [None]:
# Function to perform binary encoding on a specified column
def binary_encode_alternative(df, column, positive_value):
    """
    Encode ``column`` as 1 where it equals ``positive_value`` and 0 elsewhere.

    :param df: pandas DataFrame (not modified; a copy is returned)
    :param column: name of the column to encode
    :param positive_value: the value that maps to 1
    :return: copy of ``df`` with ``column`` replaced by 0/1 codes
    """
    def _flag(value):
        if value == positive_value:
            return 1
        return 0

    encoded = df.copy()
    encoded[column] = encoded[column].map(_flag)
    return encoded


In [None]:
# Function to perform ordinal encoding on a specified column
def ordinal_encode_alternative(df, column, ordering):
    """
    Replace every category in ``column`` with its position in ``ordering``.

    :param df: pandas DataFrame (not modified; a copy is returned)
    :param column: name of the column to encode
    :param ordering: sequence of category values, lowest rank first
    :return: copy of ``df`` whose ``column`` holds the integer ranks
    :raises ValueError: if ``column`` contains a value that is missing from ``ordering``
    """
    df = df.copy()

    # Build the rank lookup once instead of scanning the list for every row.
    # setdefault keeps the first position of a repeated category, which is what
    # list.index would return.
    rank_of = {}
    for rank, category in enumerate(ordering):
        rank_of.setdefault(category, rank)

    # Fail with a message that names the offending values rather than letting
    # them turn into NaN in the mapped column.
    unknown = [value for value in df[column].unique().tolist() if value not in rank_of]
    if unknown:
        raise ValueError(
            f"Column '{column}' contains values not present in the ordering: {unknown}"
        )

    df[column] = df[column].map(rank_of)
    return df


In [None]:
# Function to perform one-hot encoding on a specified column
def onehot_encode_alternative(df, column, prefix):
    """
    Replace ``column`` with one integer indicator column per distinct value.

    :param df: pandas DataFrame (not modified; a new frame is returned)
    :param column: name of the column to expand
    :param prefix: prefix used for the generated indicator column names
    :return: frame without ``column`` and with the indicator columns appended
    """
    indicator_columns = pd.get_dummies(df[column], prefix=prefix).astype(int)
    remaining = df.drop(columns=column)
    return remaining.join(indicator_columns)


In [None]:
# Apply binary encoding to each feature in the binary_features list.
# The two lists are paired by position, so a length mismatch would make zip drop
# features silently — fail loudly instead.
assert len(binary_features) == len(binary_positive_values), \
    "binary_features and binary_positive_values must have the same length"

for feature, positive_value in zip(binary_features, binary_positive_values):
    # Skip columns that are already numeric: re-running this cell would otherwise
    # compare the 0/1 codes with the original label and overwrite every value with 0.
    if pd.api.types.is_numeric_dtype(data[feature]):
        continue
    data = binary_encode_alternative(data, feature, positive_value)


In [None]:
# Apply one-hot encoding to each feature in the nominal_features list.
# Each feature is paired by position with its prefix from nominal_prefixes; the
# original column is replaced by '<prefix>_<value>' indicator columns.
# Re-running this cell fails because the source columns no longer exist.
for feature, prefix in zip(nominal_features, nominal_prefixes):
    data = onehot_encode_alternative(data, feature, prefix)


In [None]:
# Apply ordinal encoding to the 'StageID' column
data = ordinal_encode_alternative(data, 'StageID', stage_ordering)

# Apply ordinal encoding to the 'GradeID' column
data = ordinal_encode_alternative(data, 'GradeID', grade_ordering)


In [None]:
# Define the ordering for the target column 'Class'
target_ordering = ['L', 'M', 'H']

# Apply ordinal encoding to the target column
encoded_data = ordinal_encode_alternative(data, target_column, target_ordering)


In [None]:
# Display the first 10 rows of the encoded DataFrame to verify the transformations
encoded_data.head(10)


In [None]:
# Print the shape of the fully encoded DataFrame (the one the following cells use)
print(encoded_data.shape)


In [None]:
# Keep the encoded frame's column names as a plain Python list
columns_list = encoded_data.columns.tolist()

# Print a 1-based numbered listing of the encoded columns
for position, column_name in enumerate(columns_list, start=1):
    print(f"{position}. {column_name}")


In [None]:
# Calculate the number of missing values in each column of the encoded DataFrame
missing_values = encoded_data.isnull().sum()

# Print the missing values for each column to identify any issues
print("Missing Values:\n", missing_values)


In [None]:
# Display the first few rows of the encoded DataFrame to verify the changes
encoded_data.head()


## **Feature Engineering**

In [None]:
# Select the four activity-count columns (raised hands, visited resources,
# announcement views, discussion) for scaling and collinearity checks.
# The target column 'Class' is not part of this selection.
numerical_features = encoded_data[['raisedhands', 'VisITedResources', 'AnnouncementsView', 'Discussion']]

# Plot a box plot to visualize the distribution of features before standardization
plt.figure(figsize=(12, 6))
sns.boxplot(data=numerical_features)
plt.title('Box Plot Before Standardization')
plt.xticks(rotation=90)
plt.show()


In [None]:
# Standardize the selected features (StandardScaler is imported in the imports cell).
# NOTE(review): the scaler is fitted on the full dataset, i.e. before the
# train/test split — consider fitting on the training split only.
scaler = StandardScaler()
scaled_features = scaler.fit_transform(numerical_features)

# Create a DataFrame with the standardized features. Reuse the source index:
# duplicate rows were dropped earlier, so the row labels may have gaps, and a
# fresh 0..n-1 index would misalign with encoded_data on any label-based assignment.
scaled_features_df = pd.DataFrame(
    scaled_features,
    columns=numerical_features.columns,
    index=numerical_features.index,
)

# Plot a box plot to visualize the distribution of features after standardization
plt.figure(figsize=(12, 6))
sns.boxplot(data=scaled_features_df)
plt.title('Box Plot After Standardization')
plt.xticks(rotation=90)
plt.show()


In [None]:
# Import necessary library
from statsmodels.stats.outliers_influence import variance_inflation_factor

# Columns to exclude after the collinearity check (none at the moment).
# NOTE(review): the name says "low VIF", but columns are normally removed for a
# high VIF — confirm the intent before adding entries.
low_vif_columns = [ ]

# Remove the listed columns
final_features_df = numerical_features.drop(columns=low_vif_columns)

# VIF regresses each feature on the others; the design matrix needs an intercept
# column, otherwise the values are inflated for features with a non-zero mean.
# add_constant comes from the imports cell.
design_matrix = add_constant(final_features_df)
design_values = design_matrix.to_numpy(dtype=float)

# Report VIF for the real features only (the added constant is skipped)
vif_final_data = pd.DataFrame()
vif_final_data["feature"] = final_features_df.columns
vif_final_data["VIF"] = [
    variance_inflation_factor(design_values, design_matrix.columns.get_loc(column))
    for column in final_features_df.columns
]

# Display final VIF values
print(vif_final_data)


In [None]:
# Write the standardized values into the modelling frame. Assigning the columns
# of numerical_features here would be a no-op (they are the same raw values),
# leaving the distance- and gradient-based models on unscaled inputs.
scaled_columns = list(numerical_features.columns)

# .to_numpy() assigns by position, so differing row labels between the two
# frames cannot introduce NaNs; both frames have one row per row of encoded_data.
encoded_data[scaled_columns] = scaled_features_df[scaled_columns].to_numpy()

In [None]:
# Count missing values per column of encoded_data after the feature columns
# were written back
missing_values = encoded_data.isnull().sum()

# Print the number of missing values to ensure data integrity
print("Missing Values:\n", missing_values)


In [None]:
# Display the column names of the DataFrame to verify the current set of features
encoded_data.columns


In [None]:
# Columns whose dtype is not object (plotly.figure_factory is imported as ff in
# the imports cell)
numerical_cols = [name for name, dtype in encoded_data.dtypes.items() if dtype != 'object']

# Pairwise correlations between those columns
correlation_matrix = encoded_data[numerical_cols].corr()
axis_labels = correlation_matrix.columns.tolist()

# Annotated heatmap; each cell shows the correlation rounded to two decimals
fig = ff.create_annotated_heatmap(
    z=correlation_matrix.to_numpy(),
    x=axis_labels,
    y=list(axis_labels),
    annotation_text=correlation_matrix.round(2).values,
    colorscale='Viridis',
    reversescale=True,
    font_colors=['white', 'black'],
)

# Large square canvas with the y axis reversed so the first feature is on top
fig.update_layout(
    title='Correlation Matrix',
    xaxis_title='Features',
    yaxis_title='Features',
    yaxis_autorange='reversed',
    font=dict(size=10),
    width=1500,
    height=1500,
)

fig.show()


In [None]:
# Drop the 'S_A' indicator column. Rebinding with errors='ignore' (instead of
# inplace=True) keeps the cell safe to re-run once the column is already gone.
# NOTE(review): presumably 'S_A' is the one-hot column for SectionID == 'A'
# (prefix 'S'), removed as the reference level — confirm.
encoded_data = encoded_data.drop(columns=['S_A'], errors='ignore')

In [None]:
# Recompute the correlation heatmap on the current encoded_data
# (plotly.figure_factory is imported as ff in the imports cell)
numerical_cols = [name for name, dtype in encoded_data.dtypes.items() if dtype != 'object']

# Pairwise correlations between the non-object columns
correlation_matrix = encoded_data[numerical_cols].corr()
axis_labels = correlation_matrix.columns.tolist()

# Annotated heatmap; each cell shows the correlation rounded to two decimals
fig = ff.create_annotated_heatmap(
    z=correlation_matrix.to_numpy(),
    x=axis_labels,
    y=list(axis_labels),
    annotation_text=correlation_matrix.round(2).values,
    colorscale='Viridis',
    reversescale=True,
    font_colors=['white', 'black'],
)

# Large square canvas with the y axis reversed so the first feature is on top
fig.update_layout(
    title='Correlation Matrix',
    xaxis_title='Features',
    yaxis_title='Features',
    yaxis_autorange='reversed',
    font=dict(size=10),
    width=1500,
    height=1500,
)

fig.show()

In [None]:
# Feature matrix: every column of the encoded frame except the target
x = encoded_data.drop(['Class'], axis=1)
# Target vector: 'Class' encoded as the position in target_ordering (L=0, M=1, H=2)
y = encoded_data['Class']

## **Splitting**

In [None]:
# Hold out 20% of the rows as the test set; random_state fixes the shuffle so
# the split is reproducible. The split is not stratified by class.
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.20, random_state=42)

## **Naive Bayes**

In [None]:
# Fit a Gaussian Naive Bayes classifier with default settings on the training split
from sklearn.naive_bayes import GaussianNB
nb = GaussianNB()
nb.fit(x_train, y_train)

In [None]:
# Predict on both splits so training and test performance can be compared
y_pred_train_nb = nb.predict(x_train)
y_pred_test_nb = nb.predict(x_test)

In [None]:
# precision_score and recall_score imported here are also used by the metric
# cells of the later model sections
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score

# Accuracy of the Naive Bayes model on the training and test splits
accuracy_train_nb = accuracy_score(y_train, y_pred_train_nb)
accuracy_test_nb = accuracy_score(y_test, y_pred_test_nb)
print("Accuracy on Training Set:", accuracy_train_nb)
print("Accuracy on Test Set:", accuracy_test_nb)

In [None]:
# Test-set confusion matrix for Naive Bayes (rows: true class, columns: predicted class)
confusion_matrix(y_test, y_pred_test_nb)

In [None]:
# Test-set precision and recall for Naive Bayes, averaged over the classes
# with average='weighted'
p = precision_score(y_test, y_pred_test_nb, average='weighted')
r = recall_score(y_test, y_pred_test_nb, average='weighted')
print("Precision:", p)
print("Recall:", r)

## **KNN**

In [None]:
# Fit a k-nearest-neighbours classifier that votes over the 4 closest training rows.
# NOTE(review): an even k can produce tied votes — confirm that 4 is intentional.
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier(n_neighbors=4)
knn.fit(x_train, y_train)

In [None]:
# Predict on both splits so training and test performance can be compared
y_pred_train_knn = knn.predict(x_train)
y_pred_test_knn = knn.predict(x_test)

In [None]:
# Accuracy of the KNN model on the training and test splits
accuracy_train_knn = accuracy_score(y_train, y_pred_train_knn)
accuracy_test_knn = accuracy_score(y_test, y_pred_test_knn)
print("Accuracy on Training Set:", accuracy_train_knn)
print("Accuracy on Test Set:", accuracy_test_knn)

In [None]:

# Test-set confusion matrix for KNN (rows: true class, columns: predicted class)
confusion_matrix(y_test, y_pred_test_knn)

In [None]:
# Same names as imported in the Naive Bayes metrics cell; repeated here
from sklearn.metrics import precision_score, recall_score
# Test-set precision and recall for KNN, averaged with average='weighted'
p = precision_score(y_test, y_pred_test_knn, average='weighted')
r = recall_score(y_test, y_pred_test_knn, average='weighted')
print("Precision:", p)
print("Recall:", r)

## **Decision Tree**

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Depth-limited tree with minimum split/leaf sizes to curb overfitting.
# random_state pins the tie-breaking between equally good splits so a fresh
# Run All reproduces the same tree and the same scores.
dt = DecisionTreeClassifier(max_depth=5, min_samples_split=8, min_samples_leaf=4, random_state=42)
dt.fit(x_train, y_train)

In [None]:
# Predict on both splits so training and test performance can be compared
y_pred_train_dt = dt.predict(x_train)
y_pred_test_dt = dt.predict(x_test)

In [None]:
# Accuracy of the decision tree on the training and test splits
accuracy_train_dt = accuracy_score(y_train, y_pred_train_dt)
accuracy_test_dt = accuracy_score(y_test, y_pred_test_dt)
print("Accuracy on Training Set:", accuracy_train_dt)
print("Accuracy on Test Set:", accuracy_test_dt)

## **Random Forest**

In [None]:
# One candidate forest per (max_depth, min_samples_split, min_samples_leaf)
# combination: depths 4-8, split sizes 4-12, leaf sizes 2-6.
# RandomForestClassifier is imported in the imports cell.
rfs = []
for depth in range(4, 9):
    for min_sample in range(4, 13):
        for msl in range(2, 7):
            rfs.append(
                RandomForestClassifier(
                    n_estimators=150,
                    max_depth=depth,
                    min_samples_split=min_sample,
                    min_samples_leaf=msl,
                    random_state=42,
                )
            )


In [None]:
def t_and_t(model, threshold=0.85):
  """
  Train ``model`` on the training split and print it if it clears ``threshold``
  on the test split.

  Reads the notebook-level ``x_train``, ``y_train``, ``x_test`` and ``y_test``.

  NOTE(review): candidates are filtered by *test* accuracy, so picking
  hyper-parameters from this output tunes on the test set. Consider
  cross-validation on the training split for model selection.

  :param model: unfitted estimator exposing ``fit`` and ``predict``
  :param threshold: test accuracy the model must exceed to be printed
  :return: tuple ``(train_accuracy, test_accuracy)``
  """
  model.fit(x_train, y_train)

  train_accuracy = accuracy_score(y_train, model.predict(x_train))
  test_accuracy = accuracy_score(y_test, model.predict(x_test))

  if test_accuracy > threshold:
    print(model, train_accuracy, test_accuracy)
  return train_accuracy, test_accuracy

In [None]:
# Fit and score every candidate forest; t_and_t prints those above its
# test-accuracy threshold. This fits every model in rfs and can take a while.
for model in rfs:
  t_and_t(model)

In [None]:
# Final random forest. The hyper-parameters are presumably taken from the
# search output above — TODO confirm.
rf = RandomForestClassifier(n_estimators=150, max_depth=6, min_samples_split=12, min_samples_leaf=2, random_state=42)
rf.fit(x_train, y_train)

In [None]:
# Predict on both splits; these predictions are reused by the visualization
# section at the end of the notebook
y_pred_train_rf = rf.predict(x_train)
y_pred_test_rf = rf.predict(x_test)

In [None]:
# Accuracy of the random forest on the training and test splits
acc_train_rf = accuracy_score(y_train, y_pred_train_rf)
acc_test_rf = accuracy_score(y_test, y_pred_test_rf)

In [None]:
# Report the random-forest accuracies computed in the previous cell
print("Accuracy on Training Set:", acc_train_rf)
print("Accuracy on Test Set:", acc_test_rf)

In [None]:
# Test-set confusion matrix for the random forest (rows: true class, columns: predicted class)
confusion_matrix(y_test, y_pred_test_rf)

In [None]:
# Test-set precision and recall for the random forest, averaged with average='weighted'
p = precision_score(y_test, y_pred_test_rf, average='weighted')
r = recall_score(y_test, y_pred_test_rf, average='weighted')
print("Precision:", p)
print("Recall:", r)

## **SVM**

In [None]:
# Fit a support vector classifier with a linear kernel on the training split
from sklearn.svm import SVC
svm = SVC(kernel='linear')
svm.fit(x_train, y_train)

In [None]:
# Predict on both splits so training and test performance can be compared
y_pred_train_svm = svm.predict(x_train)
y_pred_test_svm = svm.predict(x_test)

In [None]:
# Accuracy of the SVM on the training and test splits
acc_train_svm = accuracy_score(y_train, y_pred_train_svm)
acc_test_svm = accuracy_score(y_test, y_pred_test_svm)

In [None]:
# Report the SVM accuracies computed in the previous cell
print("Accuracy on Training Set:", acc_train_svm)
print("Accuracy on Test Set:", acc_test_svm)

In [None]:
# Test-set confusion matrix for the SVM (rows: true class, columns: predicted class)
confusion_matrix(y_test, y_pred_test_svm)

In [None]:
# Test-set precision and recall for the SVM, averaged with average='weighted'
p = precision_score(y_test, y_pred_test_svm, average='weighted')
r = recall_score(y_test, y_pred_test_svm, average='weighted')
print("Precision:", p)
print("Recall:", r)

## **Logistic Regression**

In [None]:
# Fit a logistic-regression classifier with default settings on the training split.
# NOTE(review): the default iteration limit is used; watch for a
# ConvergenceWarning when this cell runs — TODO confirm it converges.
from sklearn.linear_model import LogisticRegression
lor = LogisticRegression()
lor.fit(x_train, y_train)

In [None]:
# Predict on both splits so training and test performance can be compared
y_pred_train_lor = lor.predict(x_train)
y_pred_test_lor = lor.predict(x_test)

In [None]:
# Accuracy of logistic regression on the training and test splits
acc_lor_train = accuracy_score(y_train, y_pred_train_lor)
acc_lor_test = accuracy_score(y_test, y_pred_test_lor)

In [None]:
# Cell output: (test accuracy, training accuracy) — note the test value comes first
acc_lor_test, acc_lor_train

In [None]:
# Test-set confusion matrix for logistic regression (rows: true class, columns: predicted class)
confusion_matrix(y_test, y_pred_test_lor)

In [None]:
# Test-set precision and recall for logistic regression, averaged with average='weighted'.
# 'percision' is a misspelling of 'precision'; the name is left unchanged in
# case it is referenced elsewhere.
percision_lor_test = precision_score(y_test, y_pred_test_lor, average='weighted')
recall_lor_test = recall_score(y_test, y_pred_test_lor, average='weighted')

# Cell output: (precision, recall)
percision_lor_test, recall_lor_test

## **ANN**

In [None]:
from sklearn.neural_network import MLPClassifier

# Single hidden layer of 198 units, trained for at most 85 iterations.
# random_state fixes the weight initialisation and batch shuffling so the
# reported accuracies are reproducible across runs.
ann = MLPClassifier(hidden_layer_sizes=(198,), max_iter=85, random_state=42)
ann.fit(x_train, y_train)

In [None]:
# Predict on both splits so training and test performance can be compared
y_pred_train_ann = ann.predict(x_train)
y_pred_test_ann = ann.predict(x_test)

In [None]:
# Accuracy of the neural network on the training and test splits
acc_ann_train = accuracy_score(y_train, y_pred_train_ann)
acc_ann_test = accuracy_score(y_test, y_pred_test_ann)

# Cell output: (training accuracy, test accuracy)
acc_ann_train, acc_ann_test

## **Random Forest Visualization**

In [None]:
from sklearn.metrics import confusion_matrix

# Confusion matrices of the random forest on each split
conf_matrix_train = confusion_matrix(y_train, y_pred_train_rf)
conf_matrix_test = confusion_matrix(y_test, y_pred_test_rf)

fig, ax = plt.subplots(1, 2, figsize=(12, 6))

# Draw both matrices side by side with identical styling and axis labels
panels = [
    (conf_matrix_train, 'Confusion Matrix - Training Set'),
    (conf_matrix_test, 'Confusion Matrix - Test Set'),
]
for panel_axis, (matrix, panel_title) in zip(ax, panels):
    sns.heatmap(matrix, annot=True, fmt='d', cmap='Blues', ax=panel_axis)
    panel_axis.set_title(panel_title)
    panel_axis.set_xlabel('Predicted Labels')
    panel_axis.set_ylabel('True Labels')

plt.show()

In [None]:
from sklearn.tree import plot_tree

# Inspect the first tree of the fitted forest as an example
estimator = rf.estimators_[0]

plt.figure(figsize=(20,10))
plot_tree(estimator,
          # plain lists: some scikit-learn releases reject a pandas Index here
          feature_names=list(x_train.columns),
          # readable labels: 'Class' was encoded as its position in
          # target_ordering, so class 0/1/2 corresponds to L/M/H
          class_names=list(target_ordering),
          filled=True,
          rounded=True)
plt.show()


In [None]:
# Rank the features by the forest's importance scores, largest first
importances = rf.feature_importances_
indices = np.argsort(importances)[::-1]

n_features = x_train.shape[1]
bar_positions = range(n_features)

# Bar chart of the sorted importances, labelled with the matching feature names
plt.figure(figsize=(10, 6))
plt.bar(bar_positions, importances[indices], align="center")
plt.xticks(bar_positions, x_train.columns[indices], rotation=90)
plt.xlim([-1, n_features])
plt.title("Feature Importance")
plt.tight_layout()
plt.show()