# Connect To Drive

In [None]:
from google.colab import drive

# Mount Google Drive and derive the project root from the mount point, so the
# path stays valid even when the working directory is not /content.
DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)
root = DRIVE_MOUNT_POINT + "/MyDrive/CMP/"

Mounted at /content/drive


# Local

In [None]:
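# Uncomment to use the current directory as the project root when running
# locally instead of on Colab:
# root = './'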

# Import Libraries

In [None]:
import numpy as np
import pandas as pd
import pickle
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics

# Data Preparation

In [None]:
# Load the SUBTLEX word list together with its Zipf frequency values.
SUBTLEX_df = pd.read_excel(root+'Datasets/SUBTLEX_Zipf.xlsx')

# Rename the two columns needed for the later merge and keep only those,
# selecting them by name so an unexpected spreadsheet layout raises a
# KeyError instead of silently keeping the wrong columns.
SUBTLEX_df = SUBTLEX_df.rename(columns={'Word': 'string', 'Zipf-value': 'zipf'})
SUBTLEX_df = SUBTLEX_df[['string', 'zipf']]
SUBTLEX_df.head()

Unnamed: 0,string,zipf
0,a,7.30936
1,aa,3.236317
2,aaa,2.706807
3,aah,4.721425
4,aahed,1.592864


In [None]:
# Summary statistics of the Zipf values.
SUBTLEX_df.loc[:, 'zipf'].describe()

count    74286.000000
mean         2.479172
std          0.835829
min          1.592864
25%          1.768955
50%          2.291834
75%          2.954592
max          7.621174
Name: zipf, dtype: float64

In [None]:
def categorise_by_zipf(row, hf_threshold=3):
    """
    Categorise a string as a high-frequency word (HF), a low-frequency
    word (LF) or a non-word (NW) from its label and Zipf value.

    Parameters
    ----------
    row : pandas.Series or mapping
        Record providing a 'label' entry (0 = non-word, 1 = word) and a
        'zipf' entry.
    hf_threshold : float, default 3
        A word is HF only when its Zipf value is strictly greater than
        this threshold.

    Returns
    -------
    str
        'NW' when the label is 0, 'HF' when the label is 1 and the Zipf
        value exceeds ``hf_threshold``, and 'LF' in every other case.
    """
    if row['label'] == 0:
        return 'NW'
    if row['label'] == 1 and row['zipf'] > hf_threshold:
        return 'HF'
    return 'LF'

In [None]:
def _read_pickle(path):
    """Return the object stored in the pickle file at ``path``."""
    # NOTE(review): pickle.load can execute arbitrary code; only load files
    # from a trusted source.
    with open(path, 'rb') as handle:
        return pickle.load(handle)


words = _read_pickle(root+'Datasets/FastText_V3/words_2M.pkl')
nonwords = _read_pickle(root+'Datasets/FastText_V3/nonwords_2M.pkl')

In [None]:
# Add the one-hot class code and the integer label to every record so each
# one ends up with the five fields later used as DataFrame columns
# ('string', 'freq', 'represention', 'code', 'label').
#
# The records are mutated in place, so without a guard a second run of this
# cell would insert the fields again; the length check keeps it idempotent.
N_RECORD_FIELDS = 5

for w in words:
    if len(w) < N_RECORD_FIELDS:
        w.insert(3, [1, 0])  # one-hot code for the "word" class
        w.insert(4, 1)       # label 1 = word

for nw in nonwords:
    if len(nw) < N_RECORD_FIELDS:
        nw.insert(1, 0)       # fills the 'freq' slot with 0
        nw.insert(3, [0, 1])  # one-hot code for the "non-word" class
        nw.insert(4, 0)       # label 0 = non-word

In [None]:
# Build separate frames for the ELP words and non-words (same column layout).
elp_columns = ['string', 'freq', 'represention', 'code', 'label']
elp_words = pd.DataFrame(words, columns=elp_columns)
elp_nonwords = pd.DataFrame(nonwords, columns=elp_columns)

# Attach the SUBTLEX Zipf value to each ELP word; rows left with a missing
# value after the left merge are dropped.
elp_words = elp_words.merge(SUBTLEX_df, on='string', how='left').dropna()

# Stack words and non-words, replace the remaining missing values with 0,
# derive the HF/LF/NW category and keep the first occurrence of each string.
elp_df = pd.concat([elp_words, elp_nonwords]).fillna(0)
elp_df['category'] = elp_df.apply(categorise_by_zipf, axis=1)
elp_df = elp_df.drop_duplicates(subset=['string'])

In [None]:
# Shuffle the rows; the fixed seed keeps the order identical across
# Restart & Run All executions.
elp_df = elp_df.sample(frac=1.0, random_state=42)
elp_df.head()

Unnamed: 0,string,freq,represention,code,label,zipf,category
25492,pass,48574,"[-0.010765917, -0.07305459, 0.09320474, 0.0219...","[1, 0]",1,5.03338,HF
18657,iptervention,0,"[-0.0030537408, 0.062397406, -0.0118350135, 0....","[0, 1]",0,0.0,NW
24294,ognament,0,"[-0.016061535, 0.06521283, 0.050785653, 0.0017...","[0, 1]",0,0.0,NW
2648,caths,0,"[-0.013950883, -0.020579163, 0.040436383, -0.0...","[0, 1]",0,0.0,NW
40236,wreak,453,"[-0.03075079, -0.039086945, 0.13320665, -0.083...","[1, 0]",1,2.783196,LF


In [None]:
# Keep only the first row for every distinct string.
elp_df = elp_df.drop_duplicates(subset=['string'])

In [None]:
# Number of non-null entries per column within each category (HF / LF / NW).
elp_df.groupby(by='category').count()

Unnamed: 0_level_0,string,freq,represention,code,label,zipf
category,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
HF,14333,14333,14333,14333,14333,14333
LF,19645,19645,19645,19645,19645,19645
NW,40339,40339,40339,40339,40339,40339


In [None]:
# Feature matrix (one representation vector per row) and 0/1 labels.
split_features = np.array(list(elp_df['represention']))
split_labels = elp_df['label'].to_numpy()

# 80/20 split; the fixed seed makes the partition reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    split_features,
    split_labels,
    test_size=0.2,
    shuffle=True,
    random_state=42,
)

# Random Forest

In [None]:
# Hyper-parameter grid for the random forest (2 * 3 * 4 * 2 = 48 candidates).
param_grid = {
    'n_estimators': [100, 500],
    # Must be the Python object None (use all features); the string 'None'
    # is not a valid value and makes every fit with that setting fail.
    'max_features': [None, 'sqrt', 'log2'],
    'max_depth': [None, 6, 7, 8],
    'criterion': ['gini', 'entropy'],
}

# Seeded estimator so the cross-validated scores are reproducible.
rf_clf = RandomForestClassifier(random_state=42)
rf_cv = GridSearchCV(rf_clf, param_grid, cv=5, n_jobs=7, verbose=10)
rf_cv.fit(X_train, y_train)

Fitting 5 folds for each of 48 candidates, totalling 240 fits


In [None]:
# Report the winning parameter set, its mean cross-validated score and the
# mean score of every candidate.
cv_summary = [
    ("tuned hpyerparameters :(best parameters) ", rf_cv.best_params_),
    ("best accuracy :", rf_cv.best_score_),
    ("mean accuracies :", rf_cv.cv_results_['mean_test_score']),
]
for caption, value in cv_summary:
    print(caption, value)

In [None]:
# Train the final model with the best parameters found by the grid search.
# The fixed seed makes the fitted forest (and every score derived from it)
# reproducible across runs.
RF_classifier = RandomForestClassifier(
    **rf_cv.best_params_,
    n_jobs=-1,
    verbose=1,
    random_state=42,
)
RF_classifier.fit(X_train, y_train)

In [None]:
# Accuracy on the held-out test split.
holdout_predictions = RF_classifier.predict(X_test)
score = metrics.accuracy_score(y_test, holdout_predictions)
print(score)

In [None]:
# Confusion matrix of the test-split predictions, saved as a PDF.
test_predictions = RF_classifier.predict(X_test)
cm = metrics.confusion_matrix(y_test, test_predictions)

fig, ax = plt.subplots(figsize=(9, 9))
sns.heatmap(cm, annot=True, fmt=".0f", linewidths=.5, square=True, cmap='Blues_r', ax=ax)
ax.set_ylabel('Actual label')
ax.set_xlabel('Predicted label')
all_sample_title = 'Accuracy Score: {0}'.format(score)
ax.set_title(all_sample_title, size=15)
fig.savefig(root+'Plots/Classification/RF_FT_confusion matrix.pdf')

# Get Probabilities on full dataset


In [None]:
# Predict every row of the dataset once and derive the accuracy from those
# predictions, instead of rebuilding the feature matrix and predicting again.
# Note: the full dataset contains the rows the model was trained on, so this
# accuracy is not a held-out estimate.
full_features = np.array(list(elp_df['represention']))
full_labels = elp_df['label'].to_numpy()

full_predictions = RF_classifier.predict(full_features)
score = metrics.accuracy_score(full_labels, full_predictions)
print(score)

In [None]:
# Per-class precision / recall / F1 on the full dataset, 4 decimal places.
full_report = metrics.classification_report(
    y_true=elp_df['label'].to_numpy(),
    y_pred=full_predictions,
    digits=4,
)
print(full_report)

In [None]:
# Confusion matrix of the full-dataset predictions, saved under the root.
cm = metrics.confusion_matrix(elp_df['label'].to_numpy(), full_predictions)

fig, ax = plt.subplots(figsize=(9, 9))
sns.heatmap(cm, annot=True, fmt=".0f", linewidths=.5, square=True, cmap='Blues_r', ax=ax)
ax.set_ylabel('Actual label')
ax.set_xlabel('Predicted label')
all_sample_title = 'Accuracy Score: {0}'.format(score)
ax.set_title(all_sample_title, size=15)
fig.savefig(root+'RF_full_ACC')

In [None]:
# Class probabilities for every row of the dataset.
represention_matrix = np.array(elp_df['represention'].tolist())
full_dataset_predictions = RF_classifier.predict_proba(represention_matrix)

In [None]:
# predict_proba orders its columns like RF_classifier.classes_; with the 0/1
# labels used here, column 0 is the non-word class and column 1 the word class.
nword_column, word_column = full_dataset_predictions[:, 0], full_dataset_predictions[:, 1]
elp_df['word_prob'] = word_column
elp_df['nword_prob'] = nword_column

In [None]:
# Summary statistics of both probabilities within each category.
prob_stats = ['mean', 'std', 'count', 'max', 'min']
elp_df.groupby(['category']).agg({column: prob_stats for column in ('word_prob', 'nword_prob')})

In [None]:
# The representation vectors are not written to the exported table.
word_df_to_save = elp_df.drop(columns=['represention'])

In [None]:
# Export without a header row and without the index column.
output_csv_path = root+"Datasets/fastText_RF.csv"
word_df_to_save.to_csv(output_csv_path, header=False, index=False)