In [None]:
import pandas as pd
import re
import textstat
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')
%matplotlib inline

In [None]:
# Read the raw dataset once; the following cells derive their working frame from it.
df_orig = pd.read_csv(filepath_or_buffer='./full_gpt_2.csv')
# Print column dtypes and non-null counts as a first look at missing values.
df_orig.info()

In [None]:
# Work on an independent (deep) copy so the raw frame stays available unchanged.
df = df_orig.copy(deep=True)

In [None]:
# Build the working frame straight from the raw data so this cell is idempotent:
# slicing `df` itself (df = df.iloc[:, 1:]) strips one more column on every re-run.
# The first column is discarded (presumably a saved index column -- TODO confirm
# against the CSV), then every row containing a missing value is removed.
df = df_orig.iloc[:, 1:].dropna()

# Grouping status type


In [None]:
# Collapse the free-text judge verdicts into short group codes.
# Rules are tried in order and the first match wins.
# Alternative groupings that were tried before: mapping 'memory.*limit' and
# 'time.*limit' to 'ER' (three groups in total), or 'run.*time' to 'RTE'.
STATUS_RULES = [
    ('run.*time', 'ER'),
    ('wrong.*answer', 'WA'),
    ('accepted', 'AC'),
    ('memory.*limit', 'MLE'),
    ('time.*limit', 'TLE'),
]
STATUS_CODES = {code for _, code in STATUS_RULES}


def _group_status(status):
    """Return the group code for one raw status string.

    Values that already are a group code are returned unchanged. This keeps
    the cell idempotent: lower-casing first would turn 'ER'/'WA'/... into
    'er'/'wa'/... on a second run, which no rule matches, silently renaming
    every class. Statuses that match no rule are returned lower-cased.
    """
    if status in STATUS_CODES:
        return status
    lowered = status.lower()
    for pattern, code in STATUS_RULES:
        if re.search(pattern, lowered):
            return code
    return lowered


df['Status'] = df['Status'].apply(_group_status)
df['Status'].unique()

# Textstat Features

In [None]:
# Readability and size metrics computed by textstat for every problem description.
# The dict order is the order in which the columns are appended to `df`.
descriptions = df['Description']
textstat_scorers = {
    'ARI': textstat.automated_readability_index,
    'DCR': textstat.dale_chall_readability_score,
    'FRE': textstat.flesch_reading_ease,        # the Flesch Reading Ease Score
    'SenCount': textstat.sentence_count,        # sentence count
    'LexCount': textstat.lexicon_count,         # lexicon count
    # Readability consensus (text standard); float_output=True requests a numeric result
    'textStd': lambda text: textstat.text_standard(text, float_output=True),
    'FKG': textstat.flesch_kincaid_grade,
    'SMOG': textstat.smog_index,
    'CLI': textstat.coleman_liau_index,
    'DFFW': textstat.difficult_words,
    'LINSEAR': textstat.linsear_write_formula,
    'GF': textstat.gunning_fog,
}
for column_name, scorer in textstat_scorers.items():
    df[column_name] = descriptions.apply(scorer)

# Number of test cases that did not pass: Total - Pass
df['Unpass'] = df['Total'] - df['Pass']

# Tokenization Features

In [None]:
import numpy as np
import nltk
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from gensim.models import Word2Vec
from nltk.corpus import stopwords
import string


# Fetch the NLTK resources used by preprocess_text().
# Older NLTK releases load the 'punkt' tokenizer model; newer releases look up
# 'punkt_tab' instead and raise LookupError without it, so both are requested.
nltk.download('punkt')
nltk.download('punkt_tab')

# English stop words, stored as a set for fast membership tests while filtering tokens.
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

def preprocess_text(text):
    """Turn one document into a list of cleaned tokens.

    The text is lower-cased, every character in ``string.punctuation`` is
    deleted, the remainder is split with ``nltk.word_tokenize`` and tokens
    found in the module-level ``stop_words`` set are discarded.

    Returns:
        list[str]: the remaining tokens, in document order.
    """
    punctuation_free = text.lower().translate(str.maketrans('', '', string.punctuation))
    return [token for token in nltk.word_tokenize(punctuation_free) if token not in stop_words]

W2V_DIM = 100  # embedding size; also the number of word2vec_* columns created below

df_w2v = df.copy()

# Word Embeddings using Word2Vec: tokenise every description, then train on that corpus.
df_w2v['tokens'] = [preprocess_text(doc) for doc in df_w2v.Description]

wv_model = Word2Vec(df_w2v['tokens'], vector_size=W2V_DIM, window=5, min_count=1, workers=4)


def _mean_embedding(tokens):
    """Average the Word2Vec vectors of `tokens` into a single document vector.

    A description that yields no usable token gets a zero vector. Without this
    guard np.mean([]) returns a scalar NaN, which cannot be expanded into
    W2V_DIM columns when the vectors are turned into a DataFrame.
    """
    vectors = [wv_model.wv[token] for token in tokens if token in wv_model.wv]
    if not vectors:
        return np.zeros(W2V_DIM, dtype=np.float32)
    return np.mean(vectors, axis=0)


df_w2v['word2vec'] = df_w2v['tokens'].apply(_mean_embedding)

# Split the Word2Vec features into separate columns
word2vecDFrame = pd.DataFrame(
    df_w2v['word2vec'].to_list(),
    columns=[f'word2vec_{i}' for i in range(W2V_DIM)],
)

# Both frames get a fresh 0..n-1 index so the index-based merge pairs rows by position
# (df keeps the gaps left by dropna()).
df_w2v = df_w2v.drop(['tokens', 'word2vec'], axis=1).reset_index(drop=True)
word2vecDFrame = word2vecDFrame.reset_index(drop=True)
df_w2v = pd.merge(df_w2v, word2vecDFrame, left_index=True, right_index=True)


# Reverse engineering

In [None]:
from gensim.models import Word2Vec

# Reuse the Word2Vec model trained in the tokenization cell
model = wv_model

# Names of the embedding columns
column_names = ['word2vec_' + str(i) for i in range(0, 100)]

# Map one document's averaged embedding back to its 20 nearest vocabulary words.
# Only this single row was ever printed, so the similarity search is run for it
# alone instead of once per row of the whole frame.
INSPECT_ROW = 93
if INSPECT_ROW in df_w2v.index:
    row_vector = df_w2v[column_names].loc[INSPECT_ROW].values
    similar_words = model.wv.similar_by_vector(row_vector, topn=20)
    print(similar_words)


# Bag of Words

In [None]:
# Word Frequency: Bag of Words -- one count column per vocabulary word
vectorizer = CountVectorizer(analyzer=preprocess_text)
bow_matrix = vectorizer.fit_transform(df.Description)
bow_matrix_df = pd.DataFrame(bow_matrix.toarray(), columns=vectorizer.get_feature_names_out())

# Corpus-wide count of every word, most frequent first
word_counts = bow_matrix_df.sum(axis=0)
sorted_word_counts = word_counts.sort_values(ascending=False)

# Keep only the count columns of the N most frequent words
N = 100
top_words = sorted_word_counts.index[:N]
top_words_bow_df = bow_matrix_df[top_words].reset_index(drop=True)

# Attach the counts to the base frame; both sides carry a fresh 0..n-1 index,
# so the index-based merge pairs rows by position.
df_bow = pd.merge(df.reset_index(drop=True), top_words_bow_df, left_index=True, right_index=True)


# TF-IDF

In [None]:
# Term frequency: TF-IDF
tfidf_vectorizer = TfidfVectorizer(analyzer=preprocess_text)
tfidf_matrix = tfidf_vectorizer.fit_transform(df.Description)
tfidf_feature_names = tfidf_vectorizer.get_feature_names_out()
tfidf_dense = tfidf_matrix.toarray()
tfidf_matrix_df = pd.DataFrame(tfidf_dense, columns=tfidf_feature_names)

# For each document keep its N highest TF-IDF scores (capped at the vocabulary size,
# so a small corpus no longer fails on a column-count mismatch).
N = 100
n_top = min(N, tfidf_dense.shape[1])
top_columns = [f'top_{rank + 1}' for rank in range(n_top)]

# Column positions of each row's largest scores, best first. A stable sort on the
# negated scores breaks ties by vocabulary order, like Series.nlargest(keep='first').
top_positions = np.argsort(-tfidf_dense, axis=1, kind='stable')[:, :n_top]

# Words behind the top scores (kept for inspection) and the scores themselves.
# Both are gathered in one vectorised step instead of writing the score frame
# one cell at a time with .loc inside a nested Python loop.
top_features_df = pd.DataFrame(tfidf_feature_names[top_positions], columns=top_columns)
top_features_scores_tfidf_df = pd.DataFrame(
    np.take_along_axis(tfidf_dense, top_positions, axis=1),
    columns=top_columns,
)
# NOTE(review): column top_k holds the k-th best score of each document, so the same
# column refers to different words in different rows -- confirm this is intended.

# Attach the scores to the base frame; both sides carry a fresh 0..n-1 index.
df_tfidf = pd.merge(
    df.reset_index(drop=True),
    top_features_scores_tfidf_df,
    left_index=True,
    right_index=True,
)

# All tokenized features into one DF

In [None]:
# Put the base frame and the three token-feature frames side by side.
# Every part gets a fresh 0..n-1 index first so the column-wise concat lines rows up
# by position (df still carries the gaps left by dropna()).
tokenized_parts = [
    df.reset_index(drop=True),
    word2vecDFrame.reset_index(drop=True),
    top_words_bow_df.reset_index(drop=True),
    top_features_scores_tfidf_df.reset_index(drop=True),
]
df_tokenized = pd.concat(tokenized_parts, axis=1)
df_tokenized.shape

# Status

In [None]:
# Class balance: number of rows per grouped Status value.
g = sns.displot(df, x="Status", height=5, aspect=1)
g.set(title='Histogram of Status')

# Correlation

In [None]:
from sklearn import preprocessing

le = preprocessing.LabelEncoder()

# Drop the free-text columns, then replace each Status group code by an integer.
# LabelEncoder numbers the classes in sorted label order.
df_encode = df.drop(columns=['Problem', 'Description'])
df_encode['Status'] = le.fit_transform(df_encode['Status'])
df_encode['Status'].value_counts()


In [None]:
# Large canvas and bigger fonts so the annotated correlation matrix stays readable.
sns.set(rc={'figure.figsize': [20, 10]}, font_scale=1.25)
correlations = df_encode.corr()
sns.heatmap(correlations, cmap="YlGnBu", annot=True)

# Unpass tests

In [None]:
# Pairwise scatter plots of every encoded column, coloured by the encoded Status.
sns.pairplot(data=df_encode, hue='Status')

# Prepare train and test data, using only textstat features

Status is ungrouped

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.utils import resample

RANDOM_STATE = 42  # fixed seed so the split and the resampling are reproducible

train, test = train_test_split(
    df_encode,
    test_size=0.2,
    stratify=df_encode['Status'].values,
    random_state=RANDOM_STATE,
)

# Upsample every minority class (with replacement) to the size of the majority class.
# The classes are read from the training data instead of being hard-coded as
# codes 0..4: LabelEncoder numbers the labels in sorted order, so a fixed
# code-to-name table silently goes stale whenever the Status grouping changes.
class_counts = train['Status'].value_counts()  # sorted by frequency, largest first
majority_label = class_counts.index[0]
majority_size = int(class_counts.iloc[0])

# Majority class first, then the other classes in decreasing original frequency.
balanced_parts = [train[train['Status'] == majority_label]]
for label in class_counts.index[1:]:
    balanced_parts.append(
        resample(
            train[train['Status'] == label],
            replace=True,              # sample with replacement
            n_samples=majority_size,   # match number in majority class
            random_state=RANDOM_STATE,
        )
    )
train_upsampled = pd.concat(balanced_parts)

# Only the training split is balanced; the test split keeps its original class mix.
X_train, y_train = train_upsampled.drop(columns=['Total', 'Status']), train_upsampled['Status']
X_test, y_test = test.drop(columns=['Total', 'Status']), test['Status']

# Scale features: statistics are learned on the training data and reused for the test data.
scaler = preprocessing.StandardScaler()
X_train_scaled = pd.DataFrame(scaler.fit_transform(X_train), columns=X_train.columns)
X_test_scaled = pd.DataFrame(scaler.transform(X_test), columns=X_test.columns)


Status is grouped

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.utils import resample

RANDOM_STATE = 42  # fixed seed so the split and the resampling are reproducible

train, test = train_test_split(
    df_encode,
    test_size=0.2,
    stratify=df_encode['Status'].values,
    random_state=RANDOM_STATE,
)

# Upsample every minority class (with replacement) to the size of the majority class.
# The classes are read from the training data rather than assumed to be exactly the
# codes 0, 1 and 2: with any other number of Status groups, the hard-coded version
# drops the remaining classes from training and may not treat the largest class
# as the resampling target.
class_counts = train['Status'].value_counts()  # sorted by frequency, largest first
majority_label = class_counts.index[0]
majority_size = int(class_counts.iloc[0])

# Majority class first, then the other classes in decreasing original frequency.
balanced_parts = [train[train['Status'] == majority_label]]
for label in class_counts.index[1:]:
    balanced_parts.append(
        resample(
            train[train['Status'] == label],
            replace=True,              # sample with replacement
            n_samples=majority_size,   # match number in majority class
            random_state=RANDOM_STATE,
        )
    )
train_upsampled = pd.concat(balanced_parts)

# Only the training split is balanced; the test split keeps its original class mix.
X_train, y_train = train_upsampled.drop(columns=['Total', 'Status']), train_upsampled['Status']
X_test, y_test = test.drop(columns=['Total', 'Status']), test['Status']

# Scale features: statistics are learned on the training data and reused for the test data.
scaler = preprocessing.StandardScaler()
X_train_scaled = pd.DataFrame(scaler.fit_transform(X_train), columns=X_train.columns)
X_test_scaled = pd.DataFrame(scaler.transform(X_test), columns=X_test.columns)

# Prepare train and test data, using all features (textstat + tokenized features)

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.utils import resample

RANDOM_STATE = 42  # fixed seed so the split and the resampling are reproducible

# Drop the free-text columns and encode the Status group codes as integers
# (LabelEncoder numbers the classes in sorted label order).
le = preprocessing.LabelEncoder()
df_allfeatures_encode = df_tokenized.drop(columns=['Problem', 'Description'])
df_allfeatures_encode['Status'] = le.fit_transform(df_allfeatures_encode['Status'])

train_allfeatures, test_allfeatures = train_test_split(
    df_allfeatures_encode,
    test_size=0.2,
    stratify=df_allfeatures_encode['Status'].values,
    random_state=RANDOM_STATE,
)

# Upsample every minority class (with replacement) to the size of the majority class.
# The classes are read from the training data rather than assumed to be exactly the
# codes 0, 1 and 2: with any other number of Status groups, the hard-coded version
# drops the remaining classes from training and may not treat the largest class
# as the resampling target.
class_counts_all = train_allfeatures['Status'].value_counts()  # largest class first
majority_label_all = class_counts_all.index[0]
majority_size_all = int(class_counts_all.iloc[0])

# Majority class first, then the other classes in decreasing original frequency.
balanced_parts_all = [train_allfeatures[train_allfeatures['Status'] == majority_label_all]]
for label in class_counts_all.index[1:]:
    balanced_parts_all.append(
        resample(
            train_allfeatures[train_allfeatures['Status'] == label],
            replace=True,                  # sample with replacement
            n_samples=majority_size_all,   # match number in majority class
            random_state=RANDOM_STATE,
        )
    )
train_allfeatures_upsampled = pd.concat(balanced_parts_all)

# Only the training split is balanced; the test split keeps its original class mix.
# NOTE(review): 'Pass' and 'Unpass' stay in the feature set -- confirm they do not
# leak the Status label.
X_allfeatures_train = train_allfeatures_upsampled.drop(columns=['Total', 'Status'])
y_allfeatures_train = train_allfeatures_upsampled['Status']
X_allfeatures_test = test_allfeatures.drop(columns=['Total', 'Status'])
y_allfeatures_test = test_allfeatures['Status']

# Scale features: statistics are learned on the training data and reused for the test data.
scaler_all = preprocessing.StandardScaler()
X_allfeatures_train_scaled = pd.DataFrame(
    scaler_all.fit_transform(X_allfeatures_train),
    columns=X_allfeatures_train.columns,
)
X_allfeatures_test_scaled = pd.DataFrame(
    scaler_all.transform(X_allfeatures_test),
    columns=X_allfeatures_test.columns,
)

# Select K Best

Only textstat features

In [None]:
from sklearn.feature_selection import SelectKBest, chi2, f_classif


# Score every scaled textstat feature against the encoded Status with f_classif.
# Only the per-feature scores are used below; the selector's own k=14 cut is not applied.
kBest = SelectKBest(score_func=f_classif, k=14)
kBestFeatures = kBest.fit(X_train_scaled.values, y_train.values)

# One row per feature: its name and its score
featureScores = pd.DataFrame({
    'Feature': X_train_scaled.columns,
    'Score': kBestFeatures.scores_,
})
print(featureScores.nlargest(20, 'Score'))

top_12 = featureScores.nlargest(12, 'Score')
print("top 12 scores: \n", top_12)

In [None]:
# Names and scores of the 12 best-scoring features, as plain arrays for plotting
top_12_feature_names = top_12['Feature'].to_numpy()
top_12_scores = top_12['Score'].to_numpy()

# Keep only those 12 columns of the scaled training data
X_train_scaled_top_12 = X_train_scaled[top_12_feature_names]

plt.bar(x=top_12_feature_names, height=top_12_scores)
plt.title('Top 12 Feature Scores', fontsize=14)
plt.xlabel('Feature', fontsize=12)
plt.ylabel('Score', fontsize=12)
plt.xticks(rotation=45, ha='right')
plt.grid(axis='y', linestyle='--', alpha=0.7)
plt.tight_layout()
plt.show()

In [None]:
# Pair plot of the 12 selected features coloured by class. y_train's index is reset
# so the labels line up row by row with the 0..n-1 index of the scaled frame.
cc = X_train_scaled_top_12.assign(Status=y_train.reset_index(drop=True))
sns.pairplot(data=cc, hue='Status')

# K-means

Only textstat features

In [None]:
from sklearn.cluster import KMeans
from yellowbrick.cluster import KElbowVisualizer
from sklearn import preprocessing


# Elbow search with yellowbrick: the visualizer refits this KMeans estimator for the
# candidate cluster counts given by k=(2,30) on the 12 selected features.
km = KMeans()
visualizer = KElbowVisualizer(km, k=(2,30))

# Fit on the selected training features, then render the elbow figure.
visualizer.fit(X_train_scaled_top_12)        
visualizer.show()        

In [None]:
from sklearn.cluster import KMeans

# Within-cluster sum of squares (KMeans inertia) for 1..30 clusters, computed manually.
wcss = []
for n_clusters in range(1, 31):
    km = KMeans(n_clusters=n_clusters, init='k-means++', max_iter=300, n_init=30).fit(X_train_scaled_top_12)
    wcss.append(km.inertia_)

In [None]:
# Plot inertia against the number of clusters to look for the elbow.
cluster_counts = range(1, 31)
plt.plot(cluster_counts, wcss, marker='o')
plt.xticks(cluster_counts)
plt.xlabel('No. of Clusters')
plt.ylabel('wcss')
plt.title('The Elbow Method')
plt.show()

In [None]:
# Clustering input: the 12 selected features with the class label appended.
# NOTE(review): the label ('Status') is used as a clustering feature here -- confirm
# that this is intended.
kmeans_Xtrain = pd.concat(
    [X_train_scaled_top_12, pd.DataFrame(y_train).reset_index(drop=True)],
    axis=1,
)

# 'Pass' and 'Unpass' are removed when present. errors='ignore' keeps the cell from
# raising KeyError on runs where the feature ranking left one of them out of the top 12.
kmeans_Xtrain = kmeans_Xtrain.drop(columns=['Pass', 'Unpass'], errors='ignore')

kmeans = KMeans(n_clusters=3)
kmeans.fit(kmeans_Xtrain)
y_kmeans = kmeans.predict(kmeans_Xtrain)

# Same frame with each row's cluster id appended as the last column
X_train_scaled_top_12_K_means = kmeans_Xtrain.copy()
X_train_scaled_top_12_K_means['K_means_prediction'] = y_kmeans

In [None]:
from pandas.plotting import parallel_coordinates
from itertools import cycle, islice

# Repeating blue/red/green palette, one entry per clustered row
my_colors = list(islice(cycle('brg'), len(y_kmeans)))
plt.figure(figsize=(18, 5))

# One row per cluster centre; each centre is labelled with the cluster it is assigned to.
centroid_labels = kmeans.predict(pd.DataFrame(kmeans.cluster_centers_))
centroids = pd.DataFrame(kmeans.cluster_centers_).assign(K_means_prediction=centroid_labels)
# Reuse the training frame's column names (features followed by 'K_means_prediction')
centroids.columns = X_train_scaled_top_12_K_means.columns
parallel_coordinates(centroids, 'K_means_prediction', color=my_colors, marker='o')

# Select k Best using all features

In [None]:
from sklearn.feature_selection import SelectKBest, chi2, f_classif

# Score every scaled feature (textstat + token features) against the encoded Status.
# 'Pass' and 'Unpass' are kept in the candidate set here.
X_allfeatures_train_scaled_KBest = X_allfeatures_train_scaled
kBest_all = SelectKBest(score_func=f_classif, k=14)
kBestFeatures_all = kBest_all.fit(X_allfeatures_train_scaled_KBest.values, y_allfeatures_train.values)

# One row per feature: its name and its score
featureScores_all = pd.DataFrame({
    'Feature': X_allfeatures_train_scaled_KBest.columns,
    'Score': kBestFeatures_all.scores_,
})

# The 150 best-scoring features, printed for inspection and reused by the PCA cells
top_150_all = featureScores_all.nlargest(150, 'Score')
print(top_150_all)

In [None]:
# Names and scores of the 150 best-scoring features, as plain arrays for plotting
top_150_feature_names_all = top_150_all['Feature'].to_numpy()
top_150_scores_all = top_150_all['Score'].to_numpy()

# Keep only those columns of the scaled all-features training data
X_allfeatures_train_scaled_top150 = X_allfeatures_train_scaled[top_150_feature_names_all]

plt.bar(x=top_150_feature_names_all, height=top_150_scores_all)
plt.title('Top 150 Feature Scores', fontsize=14)
plt.xlabel('Feature', fontsize=12)
plt.ylabel('Score', fontsize=12)
plt.xticks(rotation=45, ha='right')
plt.grid(axis='y', linestyle='--', alpha=0.7)
plt.tight_layout()
plt.show()

# PCA

In [None]:
from sklearn.decomposition import PCA

# Full PCA (all components) on the selected features, used to inspect explained variance.
pca = PCA()
pca.fit(X_allfeatures_train_scaled_top150)

In [None]:
# Individual and cumulative explained variance per principal component.
# The x-axis is sized from the fitted model instead of a hard-coded 150, so the cell
# still works when fewer features are available or when `pca` has been refitted with
# a smaller n_components.
explained_variance = pca.explained_variance_ratio_
component_index = range(1, len(explained_variance) + 1)

plt.bar(component_index, explained_variance, alpha=0.5, align='center',
        label='Individual explained variance')
plt.step(component_index, np.cumsum(explained_variance), where='mid',
         label='Cumulative explained variance')
plt.ylabel('Explained variance ratio')
plt.xlabel('Principal component index')
plt.legend(loc='best')
plt.tight_layout()

In [None]:
# Project the selected training features onto the first 60 principal components.
# random_state is fixed because, depending on the scikit-learn version, the default
# solver policy can pick the randomized SVD for this shape, which would make the
# projection differ from run to run.
# Note: this rebinds `pca`, replacing the full model fitted for the variance plot.
pca = PCA(n_components=60, random_state=42)
X_train_pca = pca.fit_transform(X_allfeatures_train_scaled_top150)


In [None]:
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import GridSearchCV

# Hyper-parameter grid for the MLP: 16 * 3 * 2 * 7 * 11 * 3 = 22,176 combinations,
# i.e. 221,760 fits with 10-fold CV -- expect a very long run.
mlp_param_grid = {
    'hidden_layer_sizes': [
        (50,), (50, 100), (50, 100, 50), (50, 100, 50, 10),
        (100,), (100, 200), (100, 200, 100), (100, 200, 100, 10),
        (200,), (200, 300), (200, 300, 100), (200, 300, 100, 10),
        (300,), (300, 400), (300, 400, 200), (300, 400, 200, 10),
    ],
    'activation': ['logistic', 'tanh', 'relu'],
    'solver': ['sgd', 'adam'],
    'alpha': [0.000001, 0.00001, 0.0001, 0.001, 0.01, 0.1, 1],
    'learning_rate': ['adaptive'],
    'early_stopping': [True],
    'max_iter': [500, 1000, 2000, 3000, 4000, 5000, 6000, 7000, 8000, 9000, 10000],
    'tol': [0.0001, 0.001, 0.01],
}

mlp_grid_search = GridSearchCV(
    estimator=MLPClassifier(),
    param_grid=mlp_param_grid,
    cv=10,
    n_jobs=-1,
    verbose=1,
    scoring='f1_micro',
)

# X_train_pca is derived from the all-features training split, so it must be paired
# with that split's labels. `y_train` belongs to the separately drawn textstat-only
# split and is not guaranteed to describe the same rows.
# NOTE(review): the training data was upsampled before cross-validation, so copies of
# one sample can land in both the fitting and the validation fold -- CV scores may be
# optimistic.
mlp_grid_search.fit(X_train_pca, y_allfeatures_train)

In [None]:
# Test data: select the same 150 columns, then apply the PCA fitted on the training data.
X_allfeatures_test_scaled_top150 = X_allfeatures_test_scaled.loc[:, top_150_feature_names_all]
X_test_pca = pca.transform(X_allfeatures_test_scaled_top150)


In [None]:
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score

mlp_best_params = mlp_grid_search.best_params_
print("Best Hyperparameters:", mlp_best_params)

# Get the best model found by GridSearchCV
mlp_best_model = mlp_grid_search.best_estimator_
y_pred = mlp_best_model.predict(X_test_pca)

# X_test_pca comes from the all-features test split, so predictions are scored against
# that split's labels. `y_test` belongs to a different random split, whose rows do not
# correspond to these predictions.
print(confusion_matrix(y_allfeatures_test, y_pred))
print("ANN Classification Report:")
report = classification_report(y_allfeatures_test, y_pred)
print(report)
print(accuracy_score(y_allfeatures_test, y_pred))

# Random forest model


In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

# Parameter grid: 21 * 20 * 5 * 3 * 2 = 12,600 combinations, i.e. 126,000 fits with
# 10-fold CV -- expect a very long run.
rf_param_grid = {
    'n_estimators': [10, 50, 100, 150, 200, 250, 300, 350, 400, 450, 500, 550, 600, 650, 700, 750, 800, 850, 900, 950, 1000],
    'max_depth': [None, 10, 15, 20, 25, 30, 35, 40, 45, 50, 55, 60, 65, 70, 75, 80, 85, 90, 95, 100],
    'min_samples_split': [2, 5, 10, 15, 20],
    'min_samples_leaf': [1, 2, 4],
    'bootstrap': [True, False],
}

rf = RandomForestClassifier()

rf_grid_search = GridSearchCV(
    estimator=rf,
    param_grid=rf_param_grid,
    scoring='f1_micro',
    cv=10,
    n_jobs=-1,
    verbose=2,
)

# X_train_pca is derived from the all-features training split, so it must be paired
# with that split's labels. `y_train` belongs to the separately drawn textstat-only
# split and is not guaranteed to describe the same rows.
rf_grid_search.fit(X_train_pca, y_allfeatures_train)


In [None]:
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score

rf_best_params = rf_grid_search.best_params_
print("Best Hyperparameters:", rf_best_params)

# Best model found by GridSearchCV
rf_best_model = rf_grid_search.best_estimator_
y_pred = rf_best_model.predict(X_test_pca)

# X_test_pca comes from the all-features test split, so predictions are scored against
# that split's labels. `y_test` belongs to a different random split, whose rows do not
# correspond to these predictions.
print(confusion_matrix(y_allfeatures_test, y_pred))
print("Random Forest Classification Report:")
report = classification_report(y_allfeatures_test, y_pred)
print(report)
print(accuracy_score(y_allfeatures_test, y_pred))

# Support Vector Machine

In [None]:
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV

# Parameter grid: 17 * 9 * 3 = 459 combinations, i.e. 4,590 fits with 10-fold CV.
svc_param_grid = {
    'C': [0.00001, 0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 200, 300, 400, 500, 600, 700, 800, 900, 1000],
    'gamma': [1000, 100, 10, 1, 0.1, 0.01, 0.001, 0.0001, 0.00001],
    'kernel': ['rbf', 'poly', 'sigmoid'],
}

svc = SVC()
svc_grid_search = GridSearchCV(
    estimator=svc,
    param_grid=svc_param_grid,
    scoring='f1_micro',
    cv=10,
    n_jobs=-1,
    verbose=2,
)

# X_train_pca is derived from the all-features training split, so it must be paired
# with that split's labels. `y_train` belongs to the separately drawn textstat-only
# split and is not guaranteed to describe the same rows.
svc_grid_search.fit(X_train_pca, y_allfeatures_train)

In [None]:
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score

svc_best_params = svc_grid_search.best_params_
print("Best Hyperparameters:", svc_best_params)

# Best model found by GridSearchCV
svc_best_model = svc_grid_search.best_estimator_
y_pred = svc_best_model.predict(X_test_pca)

# X_test_pca comes from the all-features test split, so predictions are scored against
# that split's labels. `y_test` belongs to a different random split, whose rows do not
# correspond to these predictions.
print(confusion_matrix(y_allfeatures_test, y_pred))
print("SVM Classification Report:")
report = classification_report(y_allfeatures_test, y_pred)
print(report)
print(accuracy_score(y_allfeatures_test, y_pred))