In [1]:
# Loading the files.

import pandas as pd

# Remote locations of the training and test CSV files.
train_data_url = 'https://raw.githubusercontent.com/WHPAN0108/BHT-DataScience-S23/main/classification/data/Assigment/aug_train.csv'
test_data_url = 'https://raw.githubusercontent.com/WHPAN0108/BHT-DataScience-S23/main/classification/data/Assigment/aug_test.csv'

# Read each CSV file into its own DataFrame (training first, then test).
train_df = pd.read_csv(train_data_url)
test_df = pd.read_csv(test_data_url)

# Preview the head of each frame: caption on one line, table below it.
print("First few rows of the training data:", train_df.head(), sep="\n")
print("First few rows of the test data:", test_df.head(), sep="\n")

First few rows of the training data:
   city_development_index  gender      relevent_experience  \
0                   0.624    Male   No relevent experience   
1                   0.926    Male  Has relevent experience   
2                   0.920    Male  Has relevent experience   
3                   0.624    Male   No relevent experience   
4                   0.920  Female  Has relevent experience   

  enrolled_university education_level major_discipline experience  \
0       no_enrollment     High School              NaN          5   
1       no_enrollment        Graduate             STEM        >20   
2       no_enrollment        Graduate             STEM        >20   
3    Full time course     High School              NaN          1   
4       no_enrollment         Masters             STEM        >20   

    company_type last_new_job  training_hours  target  
0            NaN        never              21       0  
1            NaN           >4              12       0  
2  Publ

In [2]:
# Task1 Data clean, imputation
# 1. in experience, replace >20 to 21; <1 to 1, and convert this as a numerical column
# 2. in last_new_job, replace >4 to 5; never to 0, and convert this as a numerical column
# 3. If the column is categorical, impute the missing value as its mode. If the column is numerical, impute the missing value as its median

import pandas as pd

# Function to clean the 'experience' and 'last_new_job' columns
def clean_experience_and_last_new_job(df):
    """Return a copy of ``df`` with 'experience' and 'last_new_job' as floats.

    The open-ended buckets are mapped to concrete numbers before conversion:

    * 'experience':   '>20' -> 21, '<1' -> 1
    * 'last_new_job': '>4'  -> 5,  'never' -> 0

    Missing values are kept as NaN so they can be imputed afterwards.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the columns 'experience' and 'last_new_job'.
        It is not modified.

    Returns
    -------
    pandas.DataFrame
        A new frame in which both columns have dtype float64.

    Raises
    ------
    ValueError
        If a column contains a value that is neither one of the buckets
        above nor parseable as a number.
    """
    # Work on a copy so the caller's frame (and anything already displayed
    # from it) is left untouched.
    cleaned = df.copy()

    # Replace the buckets with numeric *strings* so the column stays
    # homogeneous, then parse explicitly with pd.to_numeric. This avoids
    # relying on replace()'s implicit dtype inference for mixed str/int
    # object columns.
    cleaned['experience'] = pd.to_numeric(
        cleaned['experience'].replace({'>20': '21', '<1': '1'})
    ).astype(float)
    cleaned['last_new_job'] = pd.to_numeric(
        cleaned['last_new_job'].replace({'>4': '5', 'never': '0'})
    ).astype(float)
    return cleaned

# Run the column clean-up on both frames (training first, then test)
train_df, test_df = map(clean_experience_and_last_new_job, (train_df, test_df))

# Function to impute missing values
def impute_missing_values(df):
    """Return a copy of ``df`` with missing values imputed column by column.

    Numerical columns are filled with their median; all other columns
    (treated as categorical) are filled with their mode. When several
    modes exist, the first one returned by ``Series.mode()`` is used.

    A column that consists only of missing values has no median/mode to
    impute from and is therefore left unchanged instead of raising.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to impute. It is not modified.

    Returns
    -------
    pandas.DataFrame
        A new frame with the imputed values.
    """
    imputed = df.copy()
    for column in imputed.columns:
        series = imputed[column]
        if not series.isna().any():
            continue  # nothing to fill in this column

        if pd.api.types.is_numeric_dtype(series):
            fill_value = series.median()
        else:
            modes = series.mode()
            if modes.empty:
                continue  # every value is missing: no mode available
            fill_value = modes.iloc[0]

        if pd.isna(fill_value):
            continue  # all-missing numerical column: the median is NaN

        # Assign the filled column back to the frame. The chained form
        # `df[column].fillna(..., inplace=True)` only updates a temporary
        # Series when pandas Copy-on-Write is active, leaving the frame
        # unchanged.
        imputed[column] = series.fillna(fill_value)
    return imputed

# Fill the gaps in both frames (training first, then test)
train_df, test_df = map(impute_missing_values, (train_df, test_df))

# Preview the cleaned frames: caption on one line, table below it
print("First few rows of the cleaned training data:", train_df.head(), sep="\n")
print("First few rows of the cleaned test data:", test_df.head(), sep="\n")

# Per-column count of values that are still missing after imputation
print("Missing values in the training data:", train_df.isnull().sum(), sep="\n")
print("Missing values in the test data:", test_df.isnull().sum(), sep="\n")


First few rows of the cleaned training data:
   city_development_index  gender      relevent_experience  \
0                   0.624    Male   No relevent experience   
1                   0.926    Male  Has relevent experience   
2                   0.920    Male  Has relevent experience   
3                   0.624    Male   No relevent experience   
4                   0.920  Female  Has relevent experience   

  enrolled_university education_level major_discipline  experience  \
0       no_enrollment     High School             STEM         5.0   
1       no_enrollment        Graduate             STEM        21.0   
2       no_enrollment        Graduate             STEM        21.0   
3    Full time course     High School             STEM         1.0   
4       no_enrollment         Masters             STEM        21.0   

    company_type  last_new_job  training_hours  target  
0        Pvt Ltd           0.0              21       0  
1        Pvt Ltd           5.0              12 

In [3]:
# Task2 Classification
# 1. Build a classification model from the training set ( you can use any algorithms)
# 2. generate the confusion matrix and calculate the accuracy, precision, recall, and F1-score on training set. 
# 3. Applying the model in the test set and generating the prediction
# 4. generate the confusion matrix from the test set and calculate the accuracy, precision, recall, and F1-score

import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score
from sklearn.preprocessing import LabelEncoder

# Function to encode categorical columns into numerical values
def encode_categorical_columns(df, encoders=None):
    """Label-encode every object-dtype column of ``df`` in place.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose object-dtype columns are replaced by integer codes.
        The frame is modified in place; pass a copy to keep the original.
    encoders : dict, optional
        Mapping ``column name -> fitted LabelEncoder``. A column that is
        already present in the mapping is encoded with the stored encoder;
        any other column gets a new encoder fitted on ``df`` which is then
        stored in the mapping. Passing the same dict for the training and
        the test frame gives both frames identical codes per category.
        With the default (``None``) every column is fitted on ``df`` itself.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with the encoded columns as int64.
        Labels unknown to a reused encoder are encoded as -1.
    """
    if encoders is None:
        encoders = {}
    for column in df.select_dtypes(include=['object']).columns:
        encoder = encoders.get(column)
        if encoder is None:
            encoder = LabelEncoder().fit(df[column])
            encoders[column] = encoder
        # The code of a label is its position in `classes_`; mapping through
        # a dict (instead of `encoder.transform`) lets labels that were not
        # seen during fitting become -1 rather than raising.
        code_by_label = {label: code for code, label in enumerate(encoder.classes_)}
        df[column] = df[column].map(code_by_label).fillna(-1).astype('int64')
    return df


def _report_metrics(set_name, y_true, y_pred):
    """Print and return confusion matrix, accuracy, precision, recall and F1.

    ``set_name`` is only used in the printed captions (e.g. "Training Set").
    Returns the tuple ``(confusion_matrix, accuracy, precision, recall, f1)``.
    """
    conf_matrix = confusion_matrix(y_true, y_pred)
    accuracy = accuracy_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred)
    recall = recall_score(y_true, y_pred)
    f1 = f1_score(y_true, y_pred)

    print(f"Confusion Matrix ({set_name}):")
    print(conf_matrix)
    print(f"Accuracy ({set_name}): {accuracy:.4f}")
    print(f"Precision ({set_name}): {precision:.4f}")
    print(f"Recall ({set_name}): {recall:.4f}")
    print(f"F1-Score ({set_name}): {f1:.4f}")
    return conf_matrix, accuracy, precision, recall, f1


# Encode the categorical columns. The encoders are fitted on the training
# data and reused for the test data, so that a category is represented by the
# same integer in both frames. (Fitting a separate encoder per frame assigns
# different codes as soon as one frame lacks a category.)
label_encoders = {}
train_df_clean = encode_categorical_columns(train_df.copy(), label_encoders)
test_df_clean = encode_categorical_columns(test_df.copy(), label_encoders)

# Separate features and target variable from the training data
X_train = train_df_clean.drop('target', axis=1)
y_train = train_df_clean['target']

# Separate features and target variable from the test data. Selecting exactly
# the training feature columns keeps the column order aligned and keeps this
# cell re-runnable after 'predictions' has been added to test_df below.
X_test = test_df_clean[X_train.columns]
y_test = test_df_clean['target']

# Create and train a Random Forest classifier from the training data
model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)

# Predict on the training set and report the metrics
y_train_pred = model.predict(X_train)
(conf_matrix_train, accuracy_train, precision_train,
 recall_train, f1_train) = _report_metrics("Training Set", y_train, y_train_pred)

# Predict on the test set and report the metrics (blank line as separator)
y_test_pred = model.predict(X_test)
print()
(conf_matrix_test, accuracy_test, precision_test,
 recall_test, f1_test) = _report_metrics("Test Set", y_test, y_test_pred)

# Adding the predictions to the test dataframe
test_df['predictions'] = y_test_pred

# Display the test dataframe with predictions
test_df




Confusion Matrix (Training Set):
[[1565    0]
 [   2  533]]
Accuracy (Training Set): 0.9990
Precision (Training Set): 1.0000
Recall (Training Set): 0.9963
F1-Score (Training Set): 0.9981

Confusion Matrix (Test Set):
[[73  5]
 [16  6]]
Accuracy (Test Set): 0.7900
Precision (Test Set): 0.5455
Recall (Test Set): 0.2727
F1-Score (Test Set): 0.3636


Unnamed: 0,city_development_index,gender,relevent_experience,enrolled_university,education_level,major_discipline,experience,company_type,last_new_job,training_hours,target,predictions
0,0.624,Male,Has relevent experience,Full time course,Graduate,Other,3.0,Pvt Ltd,1.0,134,0,1
1,0.920,Female,No relevent experience,no_enrollment,Graduate,STEM,5.0,Early Stage Startup,1.0,34,1,0
2,0.767,Male,Has relevent experience,Full time course,Graduate,STEM,10.0,Pvt Ltd,2.0,90,0,0
3,0.910,Male,No relevent experience,no_enrollment,High School,STEM,10.0,Pvt Ltd,0.0,42,0,0
4,0.624,Male,Has relevent experience,Part time course,Graduate,STEM,3.0,Pvt Ltd,1.0,198,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...
95,0.698,Male,Has relevent experience,no_enrollment,Graduate,STEM,7.0,Pvt Ltd,0.0,139,0,0
96,0.926,Male,No relevent experience,no_enrollment,Masters,STEM,10.0,Pvt Ltd,2.0,45,1,0
97,0.920,Male,Has relevent experience,no_enrollment,Masters,STEM,7.0,Pvt Ltd,1.0,22,0,0
98,0.939,Male,No relevent experience,Full time course,High School,STEM,7.0,Pvt Ltd,1.0,182,0,0


In [None]:
# 5. compare the results between the training and test set

# Answer: The model performs nearly perfectly on the training set: the confusion matrix shows 0 false positives and only 2 false negatives (accuracy 0.9990).
# However, its performance on the test set is significantly worse (accuracy 0.7900), which suggests that it does not generalize well to unseen data, i.e. that it is overfitting.
# This also makes sense, since the training-set metrics are based on predictions the model made for the same data set it learned from.

# There also is a notable difference in recall and precision between the training and test sets. In the training set, both metrics are very high, whereas in the test set, they are considerably lower.
# Particularly, the recall is very low in the test set, indicating that the model fails to correctly identify many true positives (individuals looking for a job change).

# The F1-score is also much lower on the test set than on the training set (0.3636 vs. 0.9981), which further demonstrates that the model performs poorly in terms of both precision and recall on the test set.

In [None]:
# Extra points: think about what kind of the method can increase the performance (does not need to run )

# Answer: For one, implementing cross-validation is always a good start: it gives a more reliable estimate of how well the model generalizes and helps to detect and limit overfitting.
# This involves further splitting the training data into multiple subsets (folds) and training the model on different combinations of these folds while validating on the held-out fold, or adjusting the proportion of the training/validation split,
# in order to find out which model settings (e.g. hyperparameters such as the tree depth) result in the best performance on data the model has not seen.

# Another effective method of counteracting overfitting and struggles with generalization, which could potentially increase performance, would be ensemble learning with a focus on model averaging.
# Ensemble methods often lead to higher predictive accuracy compared to single models, especially when the individual models perform well on different aspects of the data or capture different patterns.
# So instead of relying on a single Random Forest classifier (which is itself already an ensemble of decision trees), ensemble learning here means training multiple models (e.g. different algorithms or variations of the same algorithm) on the same data.
# By then averaging predictions from multiple models, ensemble methods can help reduce variance and improve generalization.
# This is particularly beneficial when individual models tend to overfit on the training data, as averaging their predictions can smooth out biases and errors.