In [1]:
# DOWNLOAD LEMMATISER DATA
# `nltk.download('punkt')` and `nltk.download('wordnet')` are used to download the necessary resources for tokenization and lemmatization.
# 1. `nltk.download('punkt')`: This downloads the Punkt tokenizer. This is a pre-trained unsupervised machine learning model for tokenizing text. It's used by `nltk.word_tokenize()` to split the input text into words and punctuation.
# 2. `nltk.download('wordnet')`: This downloads WordNet, a large lexical database of English. Nouns, verbs, adjectives, and adverbs are grouped into sets of cognitive synonyms (synsets), each expressing a distinct concept. The WordNetLemmatizer uses WordNet to look up lemmas, or root forms of words.
from google.colab import drive
import nltk

# Mount Google Drive
drive.mount('/content/drive/')

# Set NLTK data path to Google Drive
nltk.data.path.append("/content/drive/My Drive/SMU TMLP/nltk_data")

# Download NLTK data to Google Drive
nltk.download('punkt', download_dir="/content/drive/My Drive/SMU TMLP/nltk_data")
nltk.download('wordnet', download_dir="/content/drive/My Drive/SMU TMLP/nltk_data")

Mounted at /content/drive/


[nltk_data] Downloading package punkt to /content/drive/My Drive/SMU
[nltk_data]     TMLP/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /content/drive/My Drive/SMU
[nltk_data]     TMLP/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [2]:
# Reading the data from Google Drive

from google.colab import drive
import os

drive.mount('/content/drive/')

os.listdir('/content/drive/My Drive/SMU TMLP') # # list the contents of the directory

Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).


['EDA.ipynb',
 'IS450 Text Mining LDA Modelling (Edited).zip',
 'enron clean.xlsx',
 'nltk_data',
 'enron_clean_reduced_temp_KHCWSMerge.csv',
 'Merge Data',
 'Feature Correlation 2.ipynb',
 '.ipynb_checkpoints',
 'Word Docs',
 'enron_clean_V2.csv',
 'Kheng',
 'Joy',
 'Jeremy',
 'Random Forest WS.ipynb',
 'Mutual Information Feature Selection Copy.ipynb',
 'enron_clean.csv',
 'Literature Review Methods Used for Analysis',
 'LDA_TopicModeling_Enron Fraud FE + PCA (Copy Latest Colab WS).ipynb',
 'Topic Clustering.ipynb',
 'XGBoost Extreme Gradient Boosting WS.ipynb']

In [3]:
import pandas as pd

# Load the dataset
file_path = "/content/drive/MyDrive/SMU TMLP/enron_clean_V2.csv"
df = pd.read_csv(file_path)

# Set the maximum number of columns to display
pd.set_option('display.max_columns', None)

# Display the first few rows of the dataset
df

Unnamed: 0,Year,Month,Day,Hour,Day_of_week,sentiment_score_compound,sentiment_score_positive,sentiment_score_neutral,sentiment_score_negative,Sender-Type,Unique-Mails-From-Sender,Contains-Reply-Forwards,Subject_lemma,Body_lemma,Subject_num_sentences,Subject_median_chars_per_word,Subject_median_words_per_sentence,Subject_uppercase_ratio,Subject_punctuation_ratio,Subject_typo_ratio,Subject_special_chars_ratio,Body_num_sentences,Body_median_chars_per_word,Body_median_words_per_sentence,Body_uppercase_ratio,Body_punctuation_ratio,Body_typo_ratio,Body_special_chars_ratio,Bcc_count,Label,Body_lexical_complexity,Body_lemma_dominant_topic,Subject_lemma_dominant_topic,VBZ,VBD,CD,PRP,VB,VBN,RB,TO,RP,VBG,NNP,VBP,JJ,NNS,IN,DT,CC,JJS,WP,NN,JJR
0,2000,11,16,17,3,0.9616,0.226,0.708,0.066,0,0.003281,0,status,status john : I be not really sure what happen...,0.0,0.063158,0.027027,0.166667,0.000000,0.000000,0.000000,0.017204,0.052632,0.012067,0.028716,0.055743,0.036036,0.055743,0.000000,0,0.923077,11,5,1,3,2,10,7,2,9,4,2,1,9,8,8,1,12,8,2,1,1,13,0
1,2000,12,8,13,4,0.5940,0.074,0.926,0.000,0,0.000729,0,re : summer inverse,re : summer inverse I suck - hope you ve make ...,0.0,0.089474,0.054054,0.000000,0.055556,0.000000,0.055556,0.004301,0.052632,0.018996,0.000000,0.030508,0.140351,0.030508,0.000000,0,0.947368,3,11,0,1,1,1,4,0,4,1,0,1,0,5,8,3,3,4,2,0,0,15,1
2,2001,5,15,16,1,-0.3923,0.036,0.918,0.045,0,0.000547,0,the wti bullet swap contract,"the wti bullet swap contract hi , follow the e...",0.0,0.042105,0.135135,0.172414,0.000000,0.200000,0.000000,0.004301,0.052632,0.062007,0.074792,0.042475,0.088889,0.042475,0.000000,0,0.775862,4,9,1,7,21,6,7,5,4,5,0,8,43,4,6,10,9,9,1,0,1,26,0
3,2000,12,12,7,1,-0.4767,0.000,0.829,0.171,0,0.001641,1,fwd : nytimes.com article : suspend rabbi quit...,fwd : nytimes.com article : suspend rabbi quit...,0.0,0.084211,0.216216,0.149254,0.044776,0.111111,0.044776,0.006452,0.111842,0.004301,0.076923,0.117949,0.310345,0.117949,0.000000,0,0.617647,2,5,0,1,1,0,1,1,0,1,0,0,10,0,2,0,1,0,0,0,0,10,0
4,2001,5,15,6,1,-0.0772,0.058,0.893,0.050,0,0.064163,0,daily chart and matrix as hot link 5/15,daily chart and matrix as hot link 5/15 the in...,0.0,0.047368,0.216216,0.000000,0.023256,0.000000,0.023256,0.017204,0.052632,0.022700,0.052726,0.035746,0.010101,0.035746,0.000000,0,0.877143,0,2,2,3,5,9,11,7,4,6,0,1,45,5,14,17,23,16,11,0,0,19,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
433069,2001,9,21,18,4,0.0000,0.000,1.000,0.000,1,0.001276,0,review board book w / rebecca \n c./billb / d...,review board book w / rebecca c./billb / daveg...,0.0,0.063158,0.135135,0.229885,0.114943,0.571429,0.114943,0.002151,0.078947,0.010753,0.201058,0.126984,0.264706,0.126984,0.000000,0,0.942308,10,8,0,1,3,0,0,0,0,0,0,0,12,0,1,0,0,0,0,0,0,1,0
433070,2001,9,4,15,1,0.0000,0.000,1.000,0.000,1,0.008385,0,audit committee material meeting,audit committee material meet two meeting have...,0.0,0.084211,0.108108,0.090909,0.000000,0.000000,0.000000,0.002151,0.078947,0.024731,0.089744,0.028846,0.060000,0.028846,0.004386,0,0.818116,11,4,0,0,6,0,1,2,0,1,0,1,21,1,1,3,2,2,2,0,0,3,0
433071,2001,11,5,9,0,0.9758,0.118,0.851,0.031,1,0.011666,0,credit story,"credit story rick / bill / david , generally ,...",0.0,0.057895,0.054054,0.166667,0.000000,0.000000,0.000000,0.017204,0.052632,0.029630,0.028217,0.025465,0.011858,0.025465,0.000000,0,0.896825,11,4,6,0,0,22,16,3,15,15,2,12,14,17,18,20,23,19,10,0,0,21,3
433072,2001,10,17,23,2,0.9658,0.067,0.905,0.028,1,0.012213,0,commodity group limit issue,commodity group limit issue in addition to the...,0.0,0.052632,0.108108,0.148148,0.000000,0.000000,0.000000,0.040860,0.052632,0.029194,0.064918,0.038695,0.045139,0.038695,0.002193,0,0.902439,4,8,19,11,25,21,22,14,34,12,2,6,123,6,14,12,57,63,22,1,0,61,0


In [4]:
df.columns

Index(['Year', 'Month', 'Day', 'Hour', 'Day_of_week',
       'sentiment_score_compound', 'sentiment_score_positive',
       'sentiment_score_neutral', 'sentiment_score_negative', 'Sender-Type',
       'Unique-Mails-From-Sender', 'Contains-Reply-Forwards', 'Subject_lemma',
       'Body_lemma', 'Subject_num_sentences', 'Subject_median_chars_per_word',
       'Subject_median_words_per_sentence', 'Subject_uppercase_ratio',
       'Subject_punctuation_ratio', 'Subject_typo_ratio',
       'Subject_special_chars_ratio', 'Body_num_sentences',
       'Body_median_chars_per_word', 'Body_median_words_per_sentence',
       'Body_uppercase_ratio', 'Body_punctuation_ratio', 'Body_typo_ratio',
       'Body_special_chars_ratio', 'Bcc_count', 'Label',
       'Body_lexical_complexity', 'Body_lemma_dominant_topic',
       'Subject_lemma_dominant_topic', 'VBZ', 'VBD', 'CD', 'PRP', 'VB', 'VBN',
       'RB', 'TO', 'RP', 'VBG', 'NNP', 'VBP', 'JJ', 'NNS', 'IN', 'DT', 'CC',
       'JJS', 'WP', 'NN', 'JJR'],
  

In [5]:
import pandas as pd

# Load the dataset
file_path = "/content/drive/MyDrive/SMU TMLP (1)/enron_clean_reduced_temp_KHCWSMerge_final.csv"
df2 = pd.read_csv(file_path)

# Set the maximum number of columns to display
pd.set_option('display.max_columns', None)

# Display the first few rows of the dataset
df2

Unnamed: 0,Year,Month,Day,Hour,Day_of_week,sentiment_score_compound,sentiment_score_positive,sentiment_score_neutral,sentiment_score_negative,Sender-Type,Unique-Mails-From-Sender,Contains-Reply-Forwards,Subject_lemma,Body_lemma,Subject_num_words,Subject_num_sentences,Subject_median_chars_per_word,Subject_median_words_per_sentence,Subject_uppercase_ratio,Subject_punctuation_ratio,Subject_typo_ratio,Subject_special_chars_ratio,Body_num_words,Body_num_sentences,Body_median_chars_per_word,Body_median_words_per_sentence,Body_uppercase_ratio,Body_punctuation_ratio,Body_typo_ratio,Body_special_chars_ratio,Bcc_count,Label,From,X-Origin,Body,Bcc,Body_pos_frequency,Body_lexical_complexity,Body_word2vec_features,Subject_lemma_dominant_topic,Subject_lemma_doc2vec_features,Body_lemma_dominant_topic
0,2000,11,16,17,3,0.9616,0.226,0.708,0.066,0,0.003281,0,status,status john : I be not really sure what happen...,0.020408,0.0,0.063158,0.027027,0.166667,0.000000,0.000000,0.000000,0.018390,0.017204,0.052632,0.012067,0.028716,0.055743,0.036036,0.055743,0.000000,0,msagel@home.com,Arnold-J,Status John: I'm not really sure what happened...,,"{'NNP': 9, ':': 1, 'PRP': 10, 'VBP': 8, 'RB': ...",0.923077,[ 0.71445566 -2.3261511 1.2993109 1.448487...,5,[-0.14599033 -0.01694362 0.00184981 -0.065501...,11
1,2000,12,8,13,4,0.5940,0.074,0.926,0.000,0,0.000729,0,re : summer inverse,re : summer inverse I suck - hope you ve make ...,0.040816,0.0,0.089474,0.054054,0.000000,0.055556,0.000000,0.055556,0.009650,0.004301,0.052632,0.018996,0.000000,0.030508,0.140351,0.030508,0.000000,0,slafontaine@globalp.com,Arnold-J,re:summer inverses i suck-hope youve made more...,,"{'NN': 15, ':': 1, 'NNS': 3, 'VBP': 5, 'VBD': ...",0.947368,[ 0.95756644 -2.114178 0.8945547 1.367256...,11,[-0.0638589 0.00566847 0.01042745 -0.022350...,3
2,2001,5,15,16,1,-0.3923,0.036,0.918,0.045,0,0.000547,0,the wti bullet swap contract,"the wti bullet swap contract hi , follow the e...",0.102041,0.0,0.042105,0.135135,0.172414,0.000000,0.200000,0.000000,0.031500,0.004301,0.052632,0.062007,0.074792,0.042475,0.088889,0.042475,0.000000,0,iceoperations@intcx.com,Arnold-J,"The WTI Bullet swap contracts Hi, Following th...",,"{'DT': 9, 'NNP': 43, 'NN': 26, 'NNS': 10, ',':...",0.775862,[ 0.4515546 -1.7378796 0.4099782 1.249512...,9,[-1.24904305e-01 -9.08257589e-02 -9.20014083e-...,4
3,2000,12,12,7,1,-0.4767,0.000,0.829,0.171,0,0.001641,1,fwd : nytimes.com article : suspend rabbi quit...,fwd : nytimes.com article : suspend rabbi quit...,0.163265,0.0,0.084211,0.216216,0.149254,0.044776,0.111111,0.044776,0.002913,0.006452,0.111842,0.004301,0.076923,0.117949,0.310345,0.117949,0.000000,0,klarnold@flash.net,Arnold-J,Fwd: NYTimes.com Article: Suspended Rabbi Quit...,,"{'NN': 10, ':': 6, 'JJ': 2, 'VBN': 1, 'NNP': 1...",0.617647,[-1.03076708e+00 -1.98776089e-02 8.55472207e-...,5,[-0.08218867 -0.00136871 -0.05490851 0.002315...,2
4,2001,5,15,6,1,-0.0772,0.058,0.893,0.050,0,0.064163,0,daily chart and matrix as hot link 5/15,daily chart and matrix as hot link 5/15 the in...,0.163265,0.0,0.047368,0.216216,0.000000,0.023256,0.000000,0.023256,0.034596,0.017204,0.052632,0.022700,0.052726,0.035746,0.010101,0.035746,0.000000,0,soblander@carrfut.com,Arnold-J,daily charts and matrices as hot links 5/15 Th...,,"{'JJ': 14, 'NNS': 17, 'CC': 11, 'IN': 23, 'CD'...",0.877143,[ 0.4489542 -2.57727 1.3087538 1.816294...,2,[-0.10805397 0.0167327 -0.00174666 -0.057156...,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
433069,2001,9,21,18,4,0.0000,0.000,1.000,0.000,1,0.001276,0,review board book w / rebecca \n c./billb / d...,review board book w / rebecca c./billb / daveg...,0.102041,0.0,0.063158,0.135135,0.229885,0.114943,0.571429,0.114943,0.003642,0.002151,0.078947,0.010753,0.201058,0.126984,0.264706,0.126984,0.000000,0,bobbie.campbell@enron.com,Buy-R,Review Board Books w/Rebecca C./BillB/DaveG/Mi...,,"{'NNP': 12, 'VBD': 1, 'WRB': 2, ':': 2, ',': 2...",0.942308,[ 0.29830086 -0.19684805 0.08153074 -0.267179...,8,[-0.30864877 -0.08117718 -0.10863884 -0.230373...,10
433070,2001,9,4,15,1,0.0000,0.000,1.000,0.000,1,0.008385,0,audit committee material meeting,audit committee material meet two meeting have...,0.081633,0.0,0.084211,0.108108,0.090909,0.000000,0.000000,0.000000,0.008376,0.002151,0.078947,0.024731,0.089744,0.028846,0.060000,0.028846,0.004386,0,sharron.westbrook@enron.com,Buy-R,Audit Committee Materials meeting Two meetings...,"dortha.gray@enron.com, k..heathman@enron.com, ...","{'NNP': 21, 'VBG': 1, 'CD': 6, 'NNS': 3, 'VBP'...",0.818116,[ 0.31306246 -1.7426066 0.68086064 0.965420...,4,[-0.10764003 -0.00839344 -0.01368004 -0.056418...,11
433071,2001,11,5,9,0,0.9758,0.118,0.851,0.031,1,0.011666,0,credit story,"credit story rick / bill / david , generally ,...",0.040816,0.0,0.057895,0.054054,0.166667,0.000000,0.000000,0.000000,0.045157,0.017204,0.052632,0.029630,0.028217,0.025465,0.011858,0.025465,0.000000,0,ted.murphy@enron.com,Buy-R,"Credit Story Rick/Bill/David, Generally, we ha...",,"{'NNP': 14, ',': 14, 'PRP': 22, 'VBP': 17, 'VB...",0.896825,[ 0.89823025 -3.3463628 1.566664 2.430928...,4,[-2.0059045e-02 -2.3803629e-03 -2.4081659e-02 ...,11
433072,2001,10,17,23,2,0.9658,0.067,0.905,0.028,1,0.012213,0,commodity group limit issue,commodity group limit issue in addition to the...,0.081633,0.0,0.052632,0.108108,0.148148,0.000000,0.000000,0.000000,0.098871,0.040860,0.052632,0.029194,0.064918,0.038695,0.045139,0.038695,0.002193,0,cassandra.schultz@enron.com,Buy-R,Commodity Group Limit Issue In addition to the...,"frank.hayden@enron.com, ted.murphy@enron.com","{'NNP': 123, 'IN': 57, 'NN': 61, 'TO': 12, 'DT...",0.902439,[ 4.9536127e-01 -2.7233346e+00 1.1807437e+00 ...,8,[ 0.1241836 -0.03928145 -0.00246533 0.019151...,4


In [6]:
df2.columns

Index(['Year', 'Month', 'Day', 'Hour', 'Day_of_week',
       'sentiment_score_compound', 'sentiment_score_positive',
       'sentiment_score_neutral', 'sentiment_score_negative', 'Sender-Type',
       'Unique-Mails-From-Sender', 'Contains-Reply-Forwards', 'Subject_lemma',
       'Body_lemma', 'Subject_num_words', 'Subject_num_sentences',
       'Subject_median_chars_per_word', 'Subject_median_words_per_sentence',
       'Subject_uppercase_ratio', 'Subject_punctuation_ratio',
       'Subject_typo_ratio', 'Subject_special_chars_ratio', 'Body_num_words',
       'Body_num_sentences', 'Body_median_chars_per_word',
       'Body_median_words_per_sentence', 'Body_uppercase_ratio',
       'Body_punctuation_ratio', 'Body_typo_ratio', 'Body_special_chars_ratio',
       'Bcc_count', 'Label', 'From', 'X-Origin', 'Body', 'Bcc',
       'Body_pos_frequency', 'Body_lexical_complexity',
       'Body_word2vec_features', 'Subject_lemma_dominant_topic',
       'Subject_lemma_doc2vec_features', 'Body_lemma_

In [7]:
import pandas as pd

# Existing dataframes df and df2
# Find the columns that are in df2 but not in df
missing_columns = df2.columns.difference(df.columns)

# Merge the missing columns from df2 into df
df = df.merge(df2[missing_columns], left_index=True, right_index=True, how='left')

In [8]:
df

Unnamed: 0,Year,Month,Day,Hour,Day_of_week,sentiment_score_compound,sentiment_score_positive,sentiment_score_neutral,sentiment_score_negative,Sender-Type,Unique-Mails-From-Sender,Contains-Reply-Forwards,Subject_lemma,Body_lemma,Subject_num_sentences,Subject_median_chars_per_word,Subject_median_words_per_sentence,Subject_uppercase_ratio,Subject_punctuation_ratio,Subject_typo_ratio,Subject_special_chars_ratio,Body_num_sentences,Body_median_chars_per_word,Body_median_words_per_sentence,Body_uppercase_ratio,Body_punctuation_ratio,Body_typo_ratio,Body_special_chars_ratio,Bcc_count,Label,Body_lexical_complexity,Body_lemma_dominant_topic,Subject_lemma_dominant_topic,VBZ,VBD,CD,PRP,VB,VBN,RB,TO,RP,VBG,NNP,VBP,JJ,NNS,IN,DT,CC,JJS,WP,NN,JJR,Bcc,Body,Body_num_words,Body_pos_frequency,Body_word2vec_features,From,Subject_lemma_doc2vec_features,Subject_num_words,X-Origin
0,2000,11,16,17,3,0.9616,0.226,0.708,0.066,0,0.003281,0,status,status john : I be not really sure what happen...,0.0,0.063158,0.027027,0.166667,0.000000,0.000000,0.000000,0.017204,0.052632,0.012067,0.028716,0.055743,0.036036,0.055743,0.000000,0,0.923077,11,5,1,3,2,10,7,2,9,4,2,1,9,8,8,1,12,8,2,1,1,13,0,,Status John: I'm not really sure what happened...,0.018390,"{'NNP': 9, ':': 1, 'PRP': 10, 'VBP': 8, 'RB': ...",[ 0.71445566 -2.3261511 1.2993109 1.448487...,msagel@home.com,[-0.14599033 -0.01694362 0.00184981 -0.065501...,0.020408,Arnold-J
1,2000,12,8,13,4,0.5940,0.074,0.926,0.000,0,0.000729,0,re : summer inverse,re : summer inverse I suck - hope you ve make ...,0.0,0.089474,0.054054,0.000000,0.055556,0.000000,0.055556,0.004301,0.052632,0.018996,0.000000,0.030508,0.140351,0.030508,0.000000,0,0.947368,3,11,0,1,1,1,4,0,4,1,0,1,0,5,8,3,3,4,2,0,0,15,1,,re:summer inverses i suck-hope youve made more...,0.009650,"{'NN': 15, ':': 1, 'NNS': 3, 'VBP': 5, 'VBD': ...",[ 0.95756644 -2.114178 0.8945547 1.367256...,slafontaine@globalp.com,[-0.0638589 0.00566847 0.01042745 -0.022350...,0.040816,Arnold-J
2,2001,5,15,16,1,-0.3923,0.036,0.918,0.045,0,0.000547,0,the wti bullet swap contract,"the wti bullet swap contract hi , follow the e...",0.0,0.042105,0.135135,0.172414,0.000000,0.200000,0.000000,0.004301,0.052632,0.062007,0.074792,0.042475,0.088889,0.042475,0.000000,0,0.775862,4,9,1,7,21,6,7,5,4,5,0,8,43,4,6,10,9,9,1,0,1,26,0,,"The WTI Bullet swap contracts Hi, Following th...",0.031500,"{'DT': 9, 'NNP': 43, 'NN': 26, 'NNS': 10, ',':...",[ 0.4515546 -1.7378796 0.4099782 1.249512...,iceoperations@intcx.com,[-1.24904305e-01 -9.08257589e-02 -9.20014083e-...,0.102041,Arnold-J
3,2000,12,12,7,1,-0.4767,0.000,0.829,0.171,0,0.001641,1,fwd : nytimes.com article : suspend rabbi quit...,fwd : nytimes.com article : suspend rabbi quit...,0.0,0.084211,0.216216,0.149254,0.044776,0.111111,0.044776,0.006452,0.111842,0.004301,0.076923,0.117949,0.310345,0.117949,0.000000,0,0.617647,2,5,0,1,1,0,1,1,0,1,0,0,10,0,2,0,1,0,0,0,0,10,0,,Fwd: NYTimes.com Article: Suspended Rabbi Quit...,0.002913,"{'NN': 10, ':': 6, 'JJ': 2, 'VBN': 1, 'NNP': 1...",[-1.03076708e+00 -1.98776089e-02 8.55472207e-...,klarnold@flash.net,[-0.08218867 -0.00136871 -0.05490851 0.002315...,0.163265,Arnold-J
4,2001,5,15,6,1,-0.0772,0.058,0.893,0.050,0,0.064163,0,daily chart and matrix as hot link 5/15,daily chart and matrix as hot link 5/15 the in...,0.0,0.047368,0.216216,0.000000,0.023256,0.000000,0.023256,0.017204,0.052632,0.022700,0.052726,0.035746,0.010101,0.035746,0.000000,0,0.877143,0,2,2,3,5,9,11,7,4,6,0,1,45,5,14,17,23,16,11,0,0,19,0,,daily charts and matrices as hot links 5/15 Th...,0.034596,"{'JJ': 14, 'NNS': 17, 'CC': 11, 'IN': 23, 'CD'...",[ 0.4489542 -2.57727 1.3087538 1.816294...,soblander@carrfut.com,[-0.10805397 0.0167327 -0.00174666 -0.057156...,0.163265,Arnold-J
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
433069,2001,9,21,18,4,0.0000,0.000,1.000,0.000,1,0.001276,0,review board book w / rebecca \n c./billb / d...,review board book w / rebecca c./billb / daveg...,0.0,0.063158,0.135135,0.229885,0.114943,0.571429,0.114943,0.002151,0.078947,0.010753,0.201058,0.126984,0.264706,0.126984,0.000000,0,0.942308,10,8,0,1,3,0,0,0,0,0,0,0,12,0,1,0,0,0,0,0,0,1,0,,Review Board Books w/Rebecca C./BillB/DaveG/Mi...,0.003642,"{'NNP': 12, 'VBD': 1, 'WRB': 2, ':': 2, ',': 2...",[ 0.29830086 -0.19684805 0.08153074 -0.267179...,bobbie.campbell@enron.com,[-0.30864877 -0.08117718 -0.10863884 -0.230373...,0.102041,Buy-R
433070,2001,9,4,15,1,0.0000,0.000,1.000,0.000,1,0.008385,0,audit committee material meeting,audit committee material meet two meeting have...,0.0,0.084211,0.108108,0.090909,0.000000,0.000000,0.000000,0.002151,0.078947,0.024731,0.089744,0.028846,0.060000,0.028846,0.004386,0,0.818116,11,4,0,0,6,0,1,2,0,1,0,1,21,1,1,3,2,2,2,0,0,3,0,"dortha.gray@enron.com, k..heathman@enron.com, ...",Audit Committee Materials meeting Two meetings...,0.008376,"{'NNP': 21, 'VBG': 1, 'CD': 6, 'NNS': 3, 'VBP'...",[ 0.31306246 -1.7426066 0.68086064 0.965420...,sharron.westbrook@enron.com,[-0.10764003 -0.00839344 -0.01368004 -0.056418...,0.081633,Buy-R
433071,2001,11,5,9,0,0.9758,0.118,0.851,0.031,1,0.011666,0,credit story,"credit story rick / bill / david , generally ,...",0.0,0.057895,0.054054,0.166667,0.000000,0.000000,0.000000,0.017204,0.052632,0.029630,0.028217,0.025465,0.011858,0.025465,0.000000,0,0.896825,11,4,6,0,0,22,16,3,15,15,2,12,14,17,18,20,23,19,10,0,0,21,3,,"Credit Story Rick/Bill/David, Generally, we ha...",0.045157,"{'NNP': 14, ',': 14, 'PRP': 22, 'VBP': 17, 'VB...",[ 0.89823025 -3.3463628 1.566664 2.430928...,ted.murphy@enron.com,[-2.0059045e-02 -2.3803629e-03 -2.4081659e-02 ...,0.040816,Buy-R
433072,2001,10,17,23,2,0.9658,0.067,0.905,0.028,1,0.012213,0,commodity group limit issue,commodity group limit issue in addition to the...,0.0,0.052632,0.108108,0.148148,0.000000,0.000000,0.000000,0.040860,0.052632,0.029194,0.064918,0.038695,0.045139,0.038695,0.002193,0,0.902439,4,8,19,11,25,21,22,14,34,12,2,6,123,6,14,12,57,63,22,1,0,61,0,"frank.hayden@enron.com, ted.murphy@enron.com",Commodity Group Limit Issue In addition to the...,0.098871,"{'NNP': 123, 'IN': 57, 'NN': 61, 'TO': 12, 'DT...",[ 4.9536127e-01 -2.7233346e+00 1.1807437e+00 ...,cassandra.schultz@enron.com,[ 0.1241836 -0.03928145 -0.00246533 0.019151...,0.081633,Buy-R


In [9]:
df.columns

Index(['Year', 'Month', 'Day', 'Hour', 'Day_of_week',
       'sentiment_score_compound', 'sentiment_score_positive',
       'sentiment_score_neutral', 'sentiment_score_negative', 'Sender-Type',
       'Unique-Mails-From-Sender', 'Contains-Reply-Forwards', 'Subject_lemma',
       'Body_lemma', 'Subject_num_sentences', 'Subject_median_chars_per_word',
       'Subject_median_words_per_sentence', 'Subject_uppercase_ratio',
       'Subject_punctuation_ratio', 'Subject_typo_ratio',
       'Subject_special_chars_ratio', 'Body_num_sentences',
       'Body_median_chars_per_word', 'Body_median_words_per_sentence',
       'Body_uppercase_ratio', 'Body_punctuation_ratio', 'Body_typo_ratio',
       'Body_special_chars_ratio', 'Bcc_count', 'Label',
       'Body_lexical_complexity', 'Body_lemma_dominant_topic',
       'Subject_lemma_dominant_topic', 'VBZ', 'VBD', 'CD', 'PRP', 'VB', 'VBN',
       'RB', 'TO', 'RP', 'VBG', 'NNP', 'VBP', 'JJ', 'NNS', 'IN', 'DT', 'CC',
       'JJS', 'WP', 'NN', 'JJR', 'Bc

**Data Preprocessing:** Prepare the data for training by handling missing values, encoding categorical variables (if any), and splitting the data into features (X) and target variable (y).

**Splitting the Data:** Split the dataset into training and testing sets.

**Model Training:** Train the Random Forest model on the training data.

**Model Evaluation:** Evaluate the trained model on the testing data to assess its performance.

**Feature Importance:** Analyze feature importance to understand which features are most influential in making predictions.

**1. Data Preprocessing**

In [None]:
# Assuming 'Label' is the target variable
X = df.drop(columns=['Label'])  # Features
y = df['Label']  # Target variable

# Handle missing values if any (not necessary if there are no missing values)
# Encoding categorical variables if any (not necessary if all features are numeric)

**2. Splitting the Data**

In [None]:
from sklearn.model_selection import train_test_split

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

**3. Label Encoding**

In [None]:
from sklearn.preprocessing import LabelEncoder

# Initialize the label encoder
label_encoder = LabelEncoder()

# Identify categorical columns
categorical_cols = X_train.select_dtypes(include=['object']).columns

# Encode categorical variables in X_train
for col in categorical_cols:
    X_train[col] = label_encoder.fit_transform(X_train[col])

# Encode categorical variable y_train
y_train = label_encoder.fit_transform(y_train)

**4. Model Training**

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score, roc_auc_score

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

# Initialize the Random Forest classifier
rf_classifier = RandomForestClassifier(random_state=42)

# Train the Random Forest model on the training data
rf_classifier.fit(X_train, y_train)

# Make predictions on the testing data
y_pred = rf_classifier.predict(X_test)

# Calculate the AUC-ROC score
auc_roc = roc_auc_score(y_test, y_pred)

# Evaluate the model
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))
print("AUC-ROC Score:", auc_roc)

Accuracy: 0.9971858223171506

Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00     68945
           1       0.99      0.44      0.61       347

    accuracy                           1.00     69292
   macro avg       0.99      0.72      0.81     69292
weighted avg       1.00      1.00      1.00     69292

AUC-ROC Score: 0.7218875129760325


In [None]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score, roc_auc_score
from sklearn.preprocessing import LabelEncoder

# Assuming 'Label' is the target variable
X = df.drop(columns=['Label'])  # Features
y = df['Label']  # Target variable

# Initialize the label encoder
label_encoder = LabelEncoder()

# Identify and encode categorical variables
categorical_cols = X.select_dtypes(include=['object']).columns
for col in categorical_cols:
    X[col] = label_encoder.fit_transform(X[col])

# Encode the target variable
y = label_encoder.fit_transform(y)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize the Random Forest classifier
rf_classifier = RandomForestClassifier(random_state=42)

# Train the Random Forest model on the training data
rf_classifier.fit(X_train, y_train)

# Make predictions on the testing data
y_pred = rf_classifier.predict(X_test)

# Calculate the AUC-ROC score
auc_roc = roc_auc_score(y_test, y_pred)

# Evaluate the model
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))
print("AUC-ROC Score:", auc_roc)

Accuracy: 0.9975177509669226

Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00     86160
           1       0.99      0.53      0.69       455

    accuracy                           1.00     86615
   macro avg       0.99      0.77      0.85     86615
weighted avg       1.00      1.00      1.00     86615

AUC-ROC Score: 0.7659224596202312


**Evaluation of the Random Forest classifier on the testing data.**

**Accuracy:**
Accuracy measures the proportion of correctly classified instances out of all instances in the testing dataset.
The accuracy achieved by the model is approximately 99.75%, which indicates that the model correctly classified about 99.75% of the instances in the testing data.

**Classification Report:**

**Precision:** Precision measures the accuracy of the positive predictions. It is the ratio of correctly predicted positive observations to the total predicted positives.

**For class 0 (label 0):** Precision is 100%, meaning that all instances predicted as class 0 were actually class 0.

**For class 1 (label 1):** Precision is 99%, indicating that 99% of the instances predicted as class 1 were actually class 1.

**Recall (Sensitivity):** Recall measures the ability of the classifier to find all positive instances. It is the ratio of correctly predicted positive observations to all actual positives.

**For class 0 (label 0):** Recall is 100%, meaning that the classifier correctly identified all instances of class 0.

**For class 1 (label 1):** Recall is 53%, indicating that only 53% of the actual instances of class 1 were correctly identified by the classifier.
F1-Score: The F1-score is the harmonic mean of precision and recall. It provides a single score that balances both precision and recall.

**For class 0 (label 0):** F1-score is 100%.

**For class 1 (label 1):** F1-score is 69%, which indicates the harmonic mean of precision and recall for class 1.
Support: Support is the number of actual occurrences of the class in the specified dataset.

**For class 0 (label 0):** There are 86,160 instances of class 0 in the testing dataset.

**For class 1 (label 1):** There are 455 instances of class 1 in the testing dataset.

**AUC-ROC Score:**
The AUC-ROC score is a measure of the area under the receiver operating characteristic (ROC) curve. It provides an aggregate measure of performance across all classification thresholds.

In this case, the AUC-ROC score is approximately 0.766, indicating moderate discriminative ability of the model between positive and negative instances.
Overall, while the model demonstrates high accuracy and precision for class 0, it struggles with class 1. This is evident from the lower recall and F1-score for class 1, suggesting that the model may not generalize well to class 1 instances. Further tuning or balancing techniques may be required to improve its performance on detecting class 1 instances.

**Oversampling with SMOTE**

**SMOTE (Synthetic Minority Over-sampling Technique)**
SMOTE (Synthetic Minority Over-sampling Technique) is a popular method for oversampling the minority class by generating synthetic samples that are similar to the existing minority class samples.
You can use the SMOTE class from the imbalanced-learn library to perform oversampling.
Undersampling:

**Undersampling**
Undersampling involves randomly removing samples from the majority class to balance the class distribution.
You can use the RandomUnderSampler class from the imbalanced-learn library to perform undersampling.

**SMOTE (Synthetic Minority Over-sampling Technique)**

**Model Training with SMOTE Oversampling**

In [None]:
pip install imbalanced-learn



In [None]:
pip install category_encoders



In [None]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score, roc_auc_score
from category_encoders import TargetEncoder
from imblearn.over_sampling import SMOTE

# df Label as the target variable
X = df.drop(columns=['Label'])  # Features
y = df['Label']  # Target variable

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Identify categorical columns
categorical_cols = X_train.select_dtypes(include=['object']).columns

# Target encode categorical variables
target_encoder = TargetEncoder(cols=categorical_cols)
X_train_encoded = target_encoder.fit_transform(X_train, y_train)

# Initialize the SMOTE oversampler
smote = SMOTE(random_state=42)

# Perform SMOTE oversampling on the target encoded training data
X_train_smote, y_train_smote = smote.fit_resample(X_train_encoded, y_train)

# Fit the Random Forest classifier on the SMOTE oversampled data
rf_classifier_smote = RandomForestClassifier(random_state=42)
rf_classifier_smote.fit(X_train_smote, y_train_smote)

# Target encode categorical variables in the test data using the same encoder
X_test_encoded = target_encoder.transform(X_test)

# Make predictions on the test data
y_pred_smote = rf_classifier_smote.predict(X_test_encoded)

# Calculate the AUC-ROC score
auc_roc_smote = roc_auc_score(y_test, y_pred_smote)

# Evaluate the model
print("Accuracy with SMOTE:", accuracy_score(y_test, y_pred_smote))
print("\nClassification Report with SMOTE:\n", classification_report(y_test, y_pred_smote))
print("AUC-ROC Score with SMOTE:", auc_roc_smote)

Accuracy with SMOTE: 0.996917393061248

Classification Report with SMOTE:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00     86160
           1       1.00      0.41      0.58       455

    accuracy                           1.00     86615
   macro avg       1.00      0.71      0.79     86615
weighted avg       1.00      1.00      1.00     86615

AUC-ROC Score with SMOTE: 0.7065934065934065


In [None]:
from imblearn.over_sampling import SMOTE
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score, roc_auc_score
from category_encoders import TargetEncoder
from imblearn.under_sampling import RandomUnderSampler

# Assuming 'Label' is the target variable
X = df.drop(columns=['Label'])  # Features
y = df['Label']  # Target variable

# Separate majority and minority classes
majority_class = df[df['Label'] == 0]
minority_class = df[df['Label'] == 1]

# Downsample the majority class to have the same number of samples as the minority class
majority_class_downsampled = majority_class.sample(n=len(minority_class), random_state=42)

# Combine the minority class with the downsampled majority class
df_balanced = pd.concat([majority_class_downsampled, minority_class])

# Shuffle the dataframe
df_balanced = df_balanced.sample(frac=1, random_state=42).reset_index(drop=True)

# Split the balanced data into features and target variable
X = df_balanced.drop(columns=['Label'])
y = df_balanced['Label']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Identify categorical columns
categorical_cols = X_train.select_dtypes(include=['object']).columns

# Target encode categorical variables
target_encoder = TargetEncoder(cols=categorical_cols)
X_train_encoded = target_encoder.fit_transform(X_train, y_train)

# Initialize the SMOTE oversampler
smote = SMOTE(random_state=42)

# Perform SMOTE oversampling on the downsampled training data
X_train_resampled, y_train_resampled = smote.fit_resample(X_train_encoded, y_train)

# Initialize the Random Forest classifier
rf_classifier = RandomForestClassifier(random_state=42)

# Train the Random Forest model on the resampled training data
rf_classifier.fit(X_train_resampled, y_train_resampled)

# Target encode categorical variables in the test data using the same encoder
X_test_encoded = target_encoder.transform(X_test)

# Make predictions on the test data
y_pred = rf_classifier.predict(X_test_encoded)

# Calculate the AUC-ROC score
auc_roc = roc_auc_score(y_test, y_pred)

# Evaluate the model
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))
print("SMOTE AUC-ROC Score:", auc_roc)

Accuracy: 0.6677927927927928

Classification Report:
               precision    recall  f1-score   support

           0       0.60      1.00      0.75       444
           1       1.00      0.34      0.50       444

    accuracy                           0.67       888
   macro avg       0.80      0.67      0.63       888
weighted avg       0.80      0.67      0.63       888

AUC-ROC Score: 0.6677927927927928


**Classification Report**

**Precision**
 Precision is the ratio of correctly predicted positive observations to the total predicted positives. For class 1 (the minority class), precision is 1.00, meaning that when the model predicts a positive outcome (class 1), it is correct 100% of the time. For class 0, precision is also 1.00, indicating perfect precision for the majority class.

**Recall (Sensitivity)**
Recall is the ratio of correctly predicted positive observations to the all observations in actual class. For class 1, recall is 0.41, meaning that the model only identifies 41% of the actual positives. For class 0, recall is 1.00, indicating that the model identifies all the actual negatives.

**F1-score**
 The F1-score is the harmonic mean of precision and recall. It provides a balance between precision and recall. For class 1, the F1-score is 0.58, which is relatively low compared to class 0's F1-score of 1.00. This indicates that the model's performance on the minority class is not as good as on the majority class.

Support: Support is the number of actual occurrences of the class in the specified dataset. In this case, there are 455 instances of class 1 and 86,160 instances of class 0.

**AUC-ROC Score**
The AUC-ROC (Area Under the Receiver Operating Characteristic) score measures the area under the ROC curve. The ROC curve is a graphical representation of the true positive rate (sensitivity) against the false positive rate (1 - specificity) for different threshold values. AUC-ROC score ranges from 0 to 1, where a score closer to 1 indicates a better-performing model.

In this case, the AUC-ROC score is 0.7066, which is relatively low. This suggests that the model's ability to distinguish between the positive and negative classes is not very strong.

**Summary**
While the model achieves high accuracy (99.69%) due to the imbalanced nature of the dataset, it is not performing well in terms of correctly identifying the minority class (class 1). The low recall and F1-score for class 1 indicate that the model is not effectively capturing the patterns associated with this class.

The AUC-ROC score further confirms the model's suboptimal performance in distinguishing between the two classes. A score of 0.7066 suggests that there is room for improvement in the model's predictive ability, particularly in handling the minority class.

**Recommendations**
Since the dataset is imbalanced, consider exploring advanced techniques such as adjusting class weights, using different resampling methods, or trying different algorithms specifically designed to handle imbalanced data.

Further feature engineering, hyperparameter tuning, and model selection might also help improve the model's performance.

It's essential to monitor the model's performance closely and iterate on different strategies to achieve a better balance between precision, recall, and overall predictive accuracy, especially for imbalanced datasets.

**Borderline-SMOTE**

In [None]:
from imblearn.combine import SMOTEENN

# Initialize the Borderline-SMOTE oversampler
borderline_smote = BorderlineSMOTE(random_state=42)

# Perform Borderline-SMOTE oversampling on the target encoded training data
X_train_resampled, y_train_resampled = borderline_smote.fit_resample(X_train_encoded, y_train)

# Fit the Random Forest classifier on the resampled data
rf_classifier_resampled = RandomForestClassifier(random_state=42)
rf_classifier_resampled.fit(X_train_resampled, y_train_resampled)

# Make predictions on the test data
y_pred_resampled = rf_classifier_resampled.predict(X_test_encoded)

# Calculate the AUC-ROC score
auc_roc_resampled = roc_auc_score(y_test, y_pred_resampled)

# Evaluate the model
print("Accuracy with Borderline-SMOTE:", accuracy_score(y_test, y_pred_resampled))
print("\nClassification Report with Borderline-SMOTE:\n", classification_report(y_test, y_pred_resampled))
print("AUC-ROC Score with Borderline-SMOTE:", auc_roc_resampled)

Accuracy with Borderline-SMOTE: 0.996917393061248

Classification Report with Borderline-SMOTE:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00     86160
           1       1.00      0.41      0.58       455

    accuracy                           1.00     86615
   macro avg       1.00      0.71      0.79     86615
weighted avg       1.00      1.00      1.00     86615

AUC-ROC Score with Borderline-SMOTE: 0.7065934065934065


**SMOTE-ENN (Synthetic Minority Over-sampling Technique and Edited Nearest Neighbors)**

In [None]:
from imblearn.combine import SMOTEENN

# Initialize the SMOTE-ENN oversampler
smote_enn = SMOTEENN(random_state=42)

# Perform SMOTE-ENN oversampling on the target encoded training data
X_train_resampled, y_train_resampled = smote_enn.fit_resample(X_train_encoded, y_train)

# Fit the Random Forest classifier on the resampled data
rf_classifier_resampled = RandomForestClassifier(random_state=42)
rf_classifier_resampled.fit(X_train_resampled, y_train_resampled)

# Make predictions on the test data
y_pred_resampled = rf_classifier_resampled.predict(X_test_encoded)

# Calculate the AUC-ROC score
auc_roc_resampled = roc_auc_score(y_test, y_pred_resampled)

# Evaluate the model
print("Accuracy with SMOTE-ENN:", accuracy_score(y_test, y_pred_resampled))
print("\nClassification Report with SMOTE-ENN:\n", classification_report(y_test, y_pred_resampled))
print("AUC-ROC Score with SMOTE-ENN:", auc_roc_resampled)

Accuracy with SMOTE-ENN: 0.996917393061248

Classification Report with SMOTE-ENN:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00     86160
           1       1.00      0.41      0.58       455

    accuracy                           1.00     86615
   macro avg       1.00      0.71      0.79     86615
weighted avg       1.00      1.00      1.00     86615

AUC-ROC Score with SMOTE-ENN: 0.7065934065934065


**Model Training with Random Undersampling**

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score, roc_auc_score
from category_encoders import TargetEncoder
from imblearn.under_sampling import RandomUnderSampler
import pandas as pd

# Assuming 'Label' is the target variable
X = df.drop(columns=['Label'])  # Features
y = df['Label']  # Target variable

# Identify categorical columns
categorical_cols = X.select_dtypes(include=['object']).columns

# Target encode categorical variables
target_encoder = TargetEncoder(cols=categorical_cols)
X_encoded = target_encoder.fit_transform(X, y)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_encoded, y, test_size=0.2, random_state=42)

# Perform random undersampling to balance class distribution
undersampler = RandomUnderSampler(random_state=42)
X_train_undersampled, y_train_undersampled = undersampler.fit_resample(X_train, y_train)

# Initialize the Random Forest classifier
rf_classifier_undersampled = RandomForestClassifier(random_state=42)

# Train the Random Forest model on the training data with random undersampling
rf_classifier_undersampled.fit(X_train_undersampled, y_train_undersampled)

# Make predictions on the testing data
y_pred_undersampled = rf_classifier_undersampled.predict(X_test)

# Calculate the AUC-ROC score
auc_roc_undersampled = roc_auc_score(y_test, y_pred_undersampled)

# Evaluate the model
print("Accuracy with Random Undersampling:", accuracy_score(y_test, y_pred_undersampled))
print("\nClassification Report with Random Undersampling:\n", classification_report(y_test, y_pred_undersampled))
print("AUC-ROC Score with Random Undersampling:", auc_roc_undersampled)

Accuracy with Random Undersampling: 1.0

Classification Report with Random Undersampling:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00     86160
           1       1.00      1.00      1.00       455

    accuracy                           1.00     86615
   macro avg       1.00      1.00      1.00     86615
weighted avg       1.00      1.00      1.00     86615

AUC-ROC Score with Random Undersampling: 1.0


In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score, roc_auc_score
from category_encoders import TargetEncoder
from imblearn.under_sampling import RandomUnderSampler

# Assuming 'Label' is the target variable
X = df.drop(columns=['Label'])  # Features
y = df['Label']  # Target variable

# Separate majority and minority classes
majority_class = df[df['Label'] == 0]
minority_class = df[df['Label'] == 1]

# Sample the majority class to have the same number of samples as the minority class
majority_class_sampled = majority_class.sample(n=len(minority_class), random_state=42)

# Combine the minority class with the sampled majority class
df_balanced = pd.concat([majority_class_sampled, minority_class])

# Shuffle the dataframe
df_balanced = df_balanced.sample(frac=1, random_state=42).reset_index(drop=True)

# Split the balanced data into features and target variable
X = df_balanced.drop(columns=['Label'])
y = df_balanced['Label']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Identify categorical columns
categorical_cols = X_train.select_dtypes(include=['object']).columns

# Target encode categorical variables
target_encoder = TargetEncoder(cols=categorical_cols)
X_train_encoded = target_encoder.fit_transform(X_train, y_train)

# Specify the desired ratio of minority class to majority class after undersampling
sampling_strategy = 1.0  # Equalize the classes

# Initialize the RandomUnderSampler
undersampler = RandomUnderSampler(sampling_strategy=sampling_strategy, random_state=42)

# Perform random undersampling on the target encoded training data
X_train_resampled, y_train_resampled = undersampler.fit_resample(X_train_encoded, y_train)

# Initialize the Random Forest classifier
rf_classifier = RandomForestClassifier(random_state=42)

# Train the Random Forest model on the training data with undersampling
rf_classifier.fit(X_train_resampled, y_train_resampled)

# Target encode categorical variables in the test data using the same encoder
X_test_encoded = target_encoder.transform(X_test)

# Make predictions on the test data
y_pred = rf_classifier.predict(X_test_encoded)

# Calculate the AUC-ROC score
auc_roc = roc_auc_score(y_test, y_pred)

# Evaluate the model
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))
print("AUC-ROC Score:", auc_roc)

Accuracy: 0.6677927927927928

Classification Report:
               precision    recall  f1-score   support

           0       0.60      1.00      0.75       444
           1       1.00      0.34      0.50       444

    accuracy                           0.67       888
   macro avg       0.80      0.67      0.63       888
weighted avg       0.80      0.67      0.63       888

AUC-ROC Score: 0.6677927927927928


In [12]:
pip install category_encoders

Collecting category_encoders
  Downloading category_encoders-2.6.3-py2.py3-none-any.whl (81 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m81.9/81.9 kB[0m [31m1.0 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: category_encoders
Successfully installed category_encoders-2.6.3


In [13]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score, roc_auc_score
from category_encoders import TargetEncoder
from imblearn.under_sampling import RandomUnderSampler

# Load the data
df3 = pd.read_csv("/content/drive/MyDrive/SMU TMLP/enron_clean_V2.csv")

# Assuming 'Label' is the target variable
X = df3.drop(columns=['Label'])  # Features
y = df3['Label']  # Target variable

# Separate majority and minority classes
majority_class = df3[df3['Label'] == 0]
minority_class = df3[df3['Label'] == 1]

# Sample the majority class to have the same number of samples as the minority class
majority_class_sampled = majority_class.sample(n=len(minority_class), random_state=42)

# Combine the minority class with the sampled majority class
df_balanced = pd.concat([majority_class_sampled, minority_class])

# Shuffle the dataframe
df_balanced = df_balanced.sample(frac=1, random_state=42).reset_index(drop=True)

# Split the balanced data into features and target variable
X = df_balanced.drop(columns=['Label'])
y = df_balanced['Label']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Identify categorical columns
categorical_cols = X_train.select_dtypes(include=['object']).columns

# Target encode categorical variables
target_encoder = TargetEncoder(cols=categorical_cols)
X_train_encoded = target_encoder.fit_transform(X_train, y_train)
X_test_encoded = target_encoder.transform(X_test)

# Specify the desired ratio of minority class to majority class after undersampling
sampling_strategy = 1.0  # Equalize the classes

# Initialize the RandomUnderSampler
undersampler = RandomUnderSampler(sampling_strategy=sampling_strategy, random_state=42)

# Perform random undersampling on the target encoded training data
X_train_resampled, y_train_resampled = undersampler.fit_resample(X_train_encoded, y_train)

# Define hyperparameters grid for RandomizedSearchCV
param_distributions = {
    'n_estimators': [100, 200, 10],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Initialize the Random Forest classifier
rf_classifier = RandomForestClassifier(random_state=42)

# Perform RandomizedSearchCV for hyperparameter tuning
random_search = RandomizedSearchCV(estimator=rf_classifier, param_distributions=param_distributions, n_iter=10, cv=5, scoring='accuracy', random_state=42)
random_search.fit(X_train_resampled, y_train_resampled)

# Get the best parameters
best_params = random_search.best_params_

# Initialize the Random Forest classifier with the best parameters
rf_classifier_best = RandomForestClassifier(**best_params, random_state=42)

# Train the Random Forest model on the training data with undersampling
rf_classifier_best.fit(X_train_resampled, y_train_resampled)

# Make predictions on the test data
y_pred = rf_classifier_best.predict(X_test_encoded)

# Calculate the AUC-ROC score
auc_roc = roc_auc_score(y_test, y_pred)

# Evaluate the model
print("Best Parameters:", best_params)
print("\nAccuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))
print("AUC-ROC Score:", auc_roc)


Best Parameters: {'n_estimators': 100, 'min_samples_split': 5, 'min_samples_leaf': 1, 'max_depth': 10}

Accuracy: 0.6722972972972973

Classification Report:
               precision    recall  f1-score   support

           0       0.60      1.00      0.75       444
           1       1.00      0.34      0.51       444

    accuracy                           0.67       888
   macro avg       0.80      0.67      0.63       888
weighted avg       0.80      0.67      0.63       888

AUC-ROC Score: 0.6722972972972973


In [14]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score, roc_auc_score
from category_encoders import TargetEncoder
from imblearn.under_sampling import RandomUnderSampler

# Load the data
df3 = pd.read_csv("/content/drive/MyDrive/SMU TMLP/enron_clean_V2.csv")

# Assuming 'Label' is the target variable
X = df3.drop(columns=['Label'])  # Features
y = df3['Label']  # Target variable

# Separate majority and minority classes
majority_class = df3[df3['Label'] == 0]
minority_class = df3[df3['Label'] == 1]

# Sample the majority class to have the same number of samples as the minority class
majority_class_sampled = majority_class.sample(n=len(minority_class), random_state=42)

# Combine the minority class with the sampled majority class
df_balanced = pd.concat([majority_class_sampled, minority_class])

# Shuffle the dataframe
df_balanced = df_balanced.sample(frac=1, random_state=42).reset_index(drop=True)

# Split the balanced data into features and target variable
X = df_balanced.drop(columns=['Label'])
y = df_balanced['Label']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Identify categorical columns
categorical_cols = X_train.select_dtypes(include=['object']).columns

# Target encode categorical variables
target_encoder = TargetEncoder(cols=categorical_cols)
X_train_encoded = target_encoder.fit_transform(X_train, y_train)
X_test_encoded = target_encoder.transform(X_test)

# Specify the desired ratio of minority class to majority class after undersampling
sampling_strategy = 1.0  # Equalize the classes

# Initialize the RandomUnderSampler
undersampler = RandomUnderSampler(sampling_strategy=sampling_strategy, random_state=42)

# Perform random undersampling on the target encoded training data
X_train_resampled, y_train_resampled = undersampler.fit_resample(X_train_encoded, y_train)

# Define hyperparameters grid for RandomizedSearchCV
param_distributions = {
    'n_estimators': [200, 300, 10],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Initialize the Random Forest classifier
rf_classifier = RandomForestClassifier(random_state=42)

# Perform RandomizedSearchCV for hyperparameter tuning
random_search = RandomizedSearchCV(estimator=rf_classifier, param_distributions=param_distributions, n_iter=10, cv=5, scoring='accuracy', random_state=42)
random_search.fit(X_train_resampled, y_train_resampled)

# Get the best parameters
best_params = random_search.best_params_

# Initialize the Random Forest classifier with the best parameters
rf_classifier_best = RandomForestClassifier(**best_params, random_state=42)

# Train the Random Forest model on the training data with undersampling
rf_classifier_best.fit(X_train_resampled, y_train_resampled)

# Make predictions on the test data
y_pred = rf_classifier_best.predict(X_test_encoded)

# Calculate the AUC-ROC score
auc_roc = roc_auc_score(y_test, y_pred)

# Evaluate the model
print("Best Parameters:", best_params)
print("\nAccuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))
print("AUC-ROC Score:", auc_roc)


Best Parameters: {'n_estimators': 200, 'min_samples_split': 5, 'min_samples_leaf': 1, 'max_depth': 10}

Accuracy: 0.6677927927927928

Classification Report:
               precision    recall  f1-score   support

           0       0.60      1.00      0.75       444
           1       1.00      0.34      0.50       444

    accuracy                           0.67       888
   macro avg       0.80      0.67      0.63       888
weighted avg       0.80      0.67      0.63       888

AUC-ROC Score: 0.6677927927927928


In [15]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score, roc_auc_score
from category_encoders import TargetEncoder
from imblearn.under_sampling import RandomUnderSampler

# Load the data
df3 = pd.read_csv("/content/drive/MyDrive/SMU TMLP/enron_clean_V2.csv")

# Assuming 'Label' is the target variable
X = df3.drop(columns=['Label'])  # Features
y = df3['Label']  # Target variable

# Separate majority and minority classes
majority_class = df3[df3['Label'] == 0]
minority_class = df3[df3['Label'] == 1]

# Sample the majority class to have the same number of samples as the minority class
majority_class_sampled = majority_class.sample(n=len(minority_class), random_state=42)

# Combine the minority class with the sampled majority class
df_balanced = pd.concat([majority_class_sampled, minority_class])

# Shuffle the dataframe
df_balanced = df_balanced.sample(frac=1, random_state=42).reset_index(drop=True)

# Split the balanced data into features and target variable
X = df_balanced.drop(columns=['Label'])
y = df_balanced['Label']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Identify categorical columns
categorical_cols = X_train.select_dtypes(include=['object']).columns

# Target encode categorical variables
target_encoder = TargetEncoder(cols=categorical_cols)
X_train_encoded = target_encoder.fit_transform(X_train, y_train)
X_test_encoded = target_encoder.transform(X_test)

# Specify the desired ratio of minority class to majority class after undersampling
sampling_strategy = 1.0  # Equalize the classes

# Initialize the RandomUnderSampler
undersampler = RandomUnderSampler(sampling_strategy=sampling_strategy, random_state=42)

# Perform random undersampling on the target encoded training data
X_train_resampled, y_train_resampled = undersampler.fit_resample(X_train_encoded, y_train)

# Define hyperparameters grid for RandomizedSearchCV
param_distributions = {
    'n_estimators': [300, 400, 10],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Initialize the Random Forest classifier
rf_classifier = RandomForestClassifier(random_state=42)

# Perform RandomizedSearchCV for hyperparameter tuning
random_search = RandomizedSearchCV(estimator=rf_classifier, param_distributions=param_distributions, n_iter=10, cv=5, scoring='accuracy', random_state=42)
random_search.fit(X_train_resampled, y_train_resampled)

# Get the best parameters
best_params = random_search.best_params_

# Initialize the Random Forest classifier with the best parameters
rf_classifier_best = RandomForestClassifier(**best_params, random_state=42)

# Train the Random Forest model on the training data with undersampling
rf_classifier_best.fit(X_train_resampled, y_train_resampled)

# Make predictions on the test data
y_pred = rf_classifier_best.predict(X_test_encoded)

# Calculate the AUC-ROC score
auc_roc = roc_auc_score(y_test, y_pred)

# Evaluate the model
print("Best Parameters:", best_params)
print("\nAccuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))
print("AUC-ROC Score:", auc_roc)

Best Parameters: {'n_estimators': 300, 'min_samples_split': 5, 'min_samples_leaf': 1, 'max_depth': 10}

Accuracy: 0.668918918918919

Classification Report:
               precision    recall  f1-score   support

           0       0.60      1.00      0.75       444
           1       1.00      0.34      0.51       444

    accuracy                           0.67       888
   macro avg       0.80      0.67      0.63       888
weighted avg       0.80      0.67      0.63       888

AUC-ROC Score: 0.6689189189189189


**Perform Hyper-parameter Tuning**

In [None]:
df.columns

Index(['Year', 'Month', 'Day', 'Hour', 'Day_of_week',
       'sentiment_score_compound', 'sentiment_score_positive',
       'sentiment_score_neutral', 'sentiment_score_negative', 'Sender-Type',
       'Unique-Mails-From-Sender', 'Contains-Reply-Forwards', 'Subject_lemma',
       'Body_lemma', 'Subject_num_sentences', 'Subject_median_chars_per_word',
       'Subject_median_words_per_sentence', 'Subject_uppercase_ratio',
       'Subject_punctuation_ratio', 'Subject_typo_ratio',
       'Subject_special_chars_ratio', 'Body_num_sentences',
       'Body_median_chars_per_word', 'Body_median_words_per_sentence',
       'Body_uppercase_ratio', 'Body_punctuation_ratio', 'Body_typo_ratio',
       'Body_special_chars_ratio', 'Bcc_count', 'Label',
       'Body_lexical_complexity', 'Body_lemma_dominant_topic',
       'Subject_lemma_dominant_topic', 'VBZ', 'VBD', 'CD', 'PRP', 'VB', 'VBN',
       'RB', 'TO', 'RP', 'VBG', 'NNP', 'VBP', 'JJ', 'NNS', 'IN', 'DT', 'CC',
       'JJS', 'WP', 'NN', 'JJR', 'Bc

In [None]:
df

Unnamed: 0,Year,Month,Day,Hour,Day_of_week,sentiment_score_compound,sentiment_score_positive,sentiment_score_neutral,sentiment_score_negative,Sender-Type,Unique-Mails-From-Sender,Contains-Reply-Forwards,Subject_lemma,Body_lemma,Subject_num_sentences,Subject_median_chars_per_word,Subject_median_words_per_sentence,Subject_uppercase_ratio,Subject_punctuation_ratio,Subject_typo_ratio,Subject_special_chars_ratio,Body_num_sentences,Body_median_chars_per_word,Body_median_words_per_sentence,Body_uppercase_ratio,Body_punctuation_ratio,Body_typo_ratio,Body_special_chars_ratio,Bcc_count,Label,Body_lexical_complexity,Body_lemma_dominant_topic,Subject_lemma_dominant_topic,VBZ,VBD,CD,PRP,VB,VBN,RB,TO,RP,VBG,NNP,VBP,JJ,NNS,IN,DT,CC,JJS,WP,NN,JJR,Bcc,Body,Body_num_words,Body_pos_frequency,Body_word2vec_features,From,Subject_lemma_doc2vec_features,Subject_num_words,X-Origin
0,2000,11,16,17,3,0.9616,0.226,0.708,0.066,0,0.003281,0,status,status john : I be not really sure what happen...,0.0,0.063158,0.027027,0.166667,0.000000,0.000000,0.000000,0.017204,0.052632,0.012067,0.028716,0.055743,0.036036,0.055743,0.000000,0,0.923077,11,5,1,3,2,10,7,2,9,4,2,1,9,8,8,1,12,8,2,1,1,13,0,,Status John: I'm not really sure what happened...,0.018390,"{'NNP': 9, ':': 1, 'PRP': 10, 'VBP': 8, 'RB': ...",[ 0.71445566 -2.3261511 1.2993109 1.448487...,msagel@home.com,[-0.14599033 -0.01694362 0.00184981 -0.065501...,0.020408,Arnold-J
1,2000,12,8,13,4,0.5940,0.074,0.926,0.000,0,0.000729,0,re : summer inverse,re : summer inverse I suck - hope you ve make ...,0.0,0.089474,0.054054,0.000000,0.055556,0.000000,0.055556,0.004301,0.052632,0.018996,0.000000,0.030508,0.140351,0.030508,0.000000,0,0.947368,3,11,0,1,1,1,4,0,4,1,0,1,0,5,8,3,3,4,2,0,0,15,1,,re:summer inverses i suck-hope youve made more...,0.009650,"{'NN': 15, ':': 1, 'NNS': 3, 'VBP': 5, 'VBD': ...",[ 0.95756644 -2.114178 0.8945547 1.367256...,slafontaine@globalp.com,[-0.0638589 0.00566847 0.01042745 -0.022350...,0.040816,Arnold-J
2,2001,5,15,16,1,-0.3923,0.036,0.918,0.045,0,0.000547,0,the wti bullet swap contract,"the wti bullet swap contract hi , follow the e...",0.0,0.042105,0.135135,0.172414,0.000000,0.200000,0.000000,0.004301,0.052632,0.062007,0.074792,0.042475,0.088889,0.042475,0.000000,0,0.775862,4,9,1,7,21,6,7,5,4,5,0,8,43,4,6,10,9,9,1,0,1,26,0,,"The WTI Bullet swap contracts Hi, Following th...",0.031500,"{'DT': 9, 'NNP': 43, 'NN': 26, 'NNS': 10, ',':...",[ 0.4515546 -1.7378796 0.4099782 1.249512...,iceoperations@intcx.com,[-1.24904305e-01 -9.08257589e-02 -9.20014083e-...,0.102041,Arnold-J
3,2000,12,12,7,1,-0.4767,0.000,0.829,0.171,0,0.001641,1,fwd : nytimes.com article : suspend rabbi quit...,fwd : nytimes.com article : suspend rabbi quit...,0.0,0.084211,0.216216,0.149254,0.044776,0.111111,0.044776,0.006452,0.111842,0.004301,0.076923,0.117949,0.310345,0.117949,0.000000,0,0.617647,2,5,0,1,1,0,1,1,0,1,0,0,10,0,2,0,1,0,0,0,0,10,0,,Fwd: NYTimes.com Article: Suspended Rabbi Quit...,0.002913,"{'NN': 10, ':': 6, 'JJ': 2, 'VBN': 1, 'NNP': 1...",[-1.03076708e+00 -1.98776089e-02 8.55472207e-...,klarnold@flash.net,[-0.08218867 -0.00136871 -0.05490851 0.002315...,0.163265,Arnold-J
4,2001,5,15,6,1,-0.0772,0.058,0.893,0.050,0,0.064163,0,daily chart and matrix as hot link 5/15,daily chart and matrix as hot link 5/15 the in...,0.0,0.047368,0.216216,0.000000,0.023256,0.000000,0.023256,0.017204,0.052632,0.022700,0.052726,0.035746,0.010101,0.035746,0.000000,0,0.877143,0,2,2,3,5,9,11,7,4,6,0,1,45,5,14,17,23,16,11,0,0,19,0,,daily charts and matrices as hot links 5/15 Th...,0.034596,"{'JJ': 14, 'NNS': 17, 'CC': 11, 'IN': 23, 'CD'...",[ 0.4489542 -2.57727 1.3087538 1.816294...,soblander@carrfut.com,[-0.10805397 0.0167327 -0.00174666 -0.057156...,0.163265,Arnold-J
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
433069,2001,9,21,18,4,0.0000,0.000,1.000,0.000,1,0.001276,0,review board book w / rebecca \n c./billb / d...,review board book w / rebecca c./billb / daveg...,0.0,0.063158,0.135135,0.229885,0.114943,0.571429,0.114943,0.002151,0.078947,0.010753,0.201058,0.126984,0.264706,0.126984,0.000000,0,0.942308,10,8,0,1,3,0,0,0,0,0,0,0,12,0,1,0,0,0,0,0,0,1,0,,Review Board Books w/Rebecca C./BillB/DaveG/Mi...,0.003642,"{'NNP': 12, 'VBD': 1, 'WRB': 2, ':': 2, ',': 2...",[ 0.29830086 -0.19684805 0.08153074 -0.267179...,bobbie.campbell@enron.com,[-0.30864877 -0.08117718 -0.10863884 -0.230373...,0.102041,Buy-R
433070,2001,9,4,15,1,0.0000,0.000,1.000,0.000,1,0.008385,0,audit committee material meeting,audit committee material meet two meeting have...,0.0,0.084211,0.108108,0.090909,0.000000,0.000000,0.000000,0.002151,0.078947,0.024731,0.089744,0.028846,0.060000,0.028846,0.004386,0,0.818116,11,4,0,0,6,0,1,2,0,1,0,1,21,1,1,3,2,2,2,0,0,3,0,"dortha.gray@enron.com, k..heathman@enron.com, ...",Audit Committee Materials meeting Two meetings...,0.008376,"{'NNP': 21, 'VBG': 1, 'CD': 6, 'NNS': 3, 'VBP'...",[ 0.31306246 -1.7426066 0.68086064 0.965420...,sharron.westbrook@enron.com,[-0.10764003 -0.00839344 -0.01368004 -0.056418...,0.081633,Buy-R
433071,2001,11,5,9,0,0.9758,0.118,0.851,0.031,1,0.011666,0,credit story,"credit story rick / bill / david , generally ,...",0.0,0.057895,0.054054,0.166667,0.000000,0.000000,0.000000,0.017204,0.052632,0.029630,0.028217,0.025465,0.011858,0.025465,0.000000,0,0.896825,11,4,6,0,0,22,16,3,15,15,2,12,14,17,18,20,23,19,10,0,0,21,3,,"Credit Story Rick/Bill/David, Generally, we ha...",0.045157,"{'NNP': 14, ',': 14, 'PRP': 22, 'VBP': 17, 'VB...",[ 0.89823025 -3.3463628 1.566664 2.430928...,ted.murphy@enron.com,[-2.0059045e-02 -2.3803629e-03 -2.4081659e-02 ...,0.040816,Buy-R
433072,2001,10,17,23,2,0.9658,0.067,0.905,0.028,1,0.012213,0,commodity group limit issue,commodity group limit issue in addition to the...,0.0,0.052632,0.108108,0.148148,0.000000,0.000000,0.000000,0.040860,0.052632,0.029194,0.064918,0.038695,0.045139,0.038695,0.002193,0,0.902439,4,8,19,11,25,21,22,14,34,12,2,6,123,6,14,12,57,63,22,1,0,61,0,"frank.hayden@enron.com, ted.murphy@enron.com",Commodity Group Limit Issue In addition to the...,0.098871,"{'NNP': 123, 'IN': 57, 'NN': 61, 'TO': 12, 'DT...",[ 4.9536127e-01 -2.7233346e+00 1.1807437e+00 ...,cassandra.schultz@enron.com,[ 0.1241836 -0.03928145 -0.00246533 0.019151...,0.081633,Buy-R


In [10]:
pip install tqdm



In [None]:
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
import pandas as pd

# Assuming 'Label' is the target variable
X = df.drop(columns=['Label'])  # Features
y = df['Label']  # Target variable

# Perform SMOTE oversampling to balance class distribution
smote = SMOTE(random_state=42)
X_train_smote, y_train_smote = smote.fit_resample(X, y)

# Perform random undersampling to balance class distribution
undersampler = RandomUnderSampler(random_state=42)
X_train_undersampled, y_train_undersampled = undersampler.fit_resample(X, y)

# Check class distribution after SMOTE oversampling
print("Class distribution after SMOTE oversampling:")
print(pd.Series(y_train_smote).value_counts())

# Check class distribution after random undersampling
print("\nClass distribution after random undersampling:")
print(pd.Series(y_train_undersampled).value_counts())

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_squared_error, mean_absolute_percentage_error
import numpy as np

# Assuming df contains your features and labels
features = df.drop(columns=['Label'])
labels = df['Label']

# Split the data into training and testing sets
train_features, test_features, train_labels, test_labels = train_test_split(features, labels, test_size=0.2, random_state=42)

# Define the parameter grid for hyperparameter tuning
param_grid = {
    'n_estimators': np.arange(10, 501, 50),  # Range from 10 to 500 with step size 50
    'max_depth': [None] + list(np.arange(10, 31, 10)),  # Include None and range from 10 to 30 with step size 10
    'min_samples_split': np.arange(2, 11, 3),  # Range from 2 to 10 with step size 3
    'min_samples_leaf': np.arange(1, 5)  # Range from 1 to 4
}

# Initialize the grid search with RandomForestRegressor
grid_search = GridSearchCV(estimator=RandomForestRegressor(random_state=42),
                           param_grid=param_grid,
                           cv=5,
                           scoring='neg_mean_squared_error',
                           verbose=2,
                           n_jobs=-1)  # Parallelize computation using all available CPU cores

# Fit the grid search to the training data
grid_search.fit(train_features, train_labels)

# Get the best model and its hyperparameters
best_params = grid_search.best_params_
best_model = grid_search.best_estimator_

# Make predictions using the best model
best_predictions = best_model.predict(test_features)

# Calculate evaluation metrics: adjusted R squared, RMSE, and MAPE
def adjusted_r_squared(y_true, y_pred, n_features):
    r_squared = 1 - (np.sum((y_true - y_pred) ** 2) / np.sum((y_true - np.mean(y_true)) ** 2))
    adjusted_r_squared = 1 - ((1 - r_squared) * (len(y_true) - 1) / (len(y_true) - n_features - 1))
    return adjusted_r_squared

def rmse(y_true, y_pred):
    return np.sqrt(mean_squared_error(y_true, y_pred))

adj_r_squared = adjusted_r_squared(test_labels, best_predictions, len(features.columns))
rmse_value = rmse(test_labels, best_predictions)
mape_value = mean_absolute_percentage_error(test_labels, best_predictions)

# Display evaluation metrics
print("Best Model Hyperparameters:", best_params)
print("Adjusted R squared:", adj_r_squared)
print("RMSE:", rmse_value)
print("Mean Absolute Percentage Error:", mape_value)

Fitting 5 folds for each of 480 candidates, totalling 2400 fits


**Specify ranges for each hyperparameter instead of specific values**

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_squared_error, mean_absolute_percentage_error
import numpy as np

# Assuming df contains your features and labels
features = df.drop(columns=['Label'])
labels = df['Label']

# Split the data into training and testing sets
train_features, test_features, train_labels, test_labels = train_test_split(features, labels, test_size=0.2, random_state=42)

# Define the parameter grid for hyperparameter tuning
param_grid = {
    'n_estimators': np.arange(10, 501, 50),  # Range from 10 to 500 with step size 50
    'max_depth': [None] + list(np.arange(10, 31, 10)),  # Include None and range from 10 to 30 with step size 10
    'min_samples_split': np.arange(2, 11, 3),  # Range from 2 to 10 with step size 3
    'min_samples_leaf': np.arange(1, 5)  # Range from 1 to 4
}

# Initialize the grid search with RandomForestRegressor
grid_search = GridSearchCV(estimator=RandomForestRegressor(random_state=42),
                           param_grid=param_grid,
                           cv=5,
                           scoring='neg_mean_squared_error',
                           verbose=2,
                           n_jobs=-1)

# Fit the grid search to the training data
grid_search.fit(train_features, train_labels)

# Get the best model and its hyperparameters
best_params = grid_search.best_params_
best_model = grid_search.best_estimator_

# Make predictions using the best model
best_predictions = best_model.predict(test_features)

# Calculate evaluation metrics: adjusted R squared, RMSE, and MAPE
def adjusted_r_squared(y_true, y_pred, n_features):
    r_squared = 1 - (np.sum((y_true - y_pred) ** 2) / np.sum((y_true - np.mean(y_true)) ** 2))
    adjusted_r_squared = 1 - ((1 - r_squared) * (len(y_true) - 1) / (len(y_true) - n_features - 1))
    return adjusted_r_squared

def rmse(y_true, y_pred):
    return np.sqrt(mean_squared_error(y_true, y_pred))

adj_r_squared = adjusted_r_squared(test_labels, best_predictions, len(features.columns))
rmse_value = rmse(test_labels, best_predictions)
mape_value = mean_absolute_percentage_error(test_labels, best_predictions)

# Display evaluation metrics
print("Best Model Hyperparameters:", best_params)
print("Adjusted R squared:", adj_r_squared)
print("RMSE:", rmse_value)
print("Mean Absolute Percentage Error:", mape_value)

Fitting 5 folds for each of 480 candidates, totalling 2400 fits


In [None]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_squared_error, mean_absolute_percentage_error

# Assuming df contains your features and labels
features = df.drop(columns=['Label'])
labels = df['Label']

# Split the data into training and testing sets
train_features, test_features, train_labels, test_labels = train_test_split(features, labels, test_size=0.2, random_state=42)

# Define the parameter grid for hyperparameter tuning
param_grid = {
    'n_estimators': [10, 50, 100, 500],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Initialize the grid search with RandomForestRegressor
grid_search = GridSearchCV(estimator=RandomForestRegressor(random_state=42),
                           param_grid=param_grid,
                           cv=5,
                           scoring='neg_mean_squared_error',
                           verbose=2,
                           n_jobs=-1)

# Fit the grid search to the training data
grid_search.fit(train_features, train_labels)

# Get the best model and its hyperparameters
best_params = grid_search.best_params_
best_model = grid_search.best_estimator_

# Make predictions using the best model
best_predictions = best_model.predict(test_features)

# Calculate evaluation metrics: adjusted R squared, RMSE, and MAPE
def adjusted_r_squared(y_true, y_pred, n_features):
    r_squared = 1 - (np.sum((y_true - y_pred) ** 2) / np.sum((y_true - np.mean(y_true)) ** 2))
    adjusted_r_squared = 1 - ((1 - r_squared) * (len(y_true) - 1) / (len(y_true) - n_features - 1))
    return adjusted_r_squared

def rmse(y_true, y_pred):
    return np.sqrt(mean_squared_error(y_true, y_pred))

adj_r_squared = adjusted_r_squared(test_labels, best_predictions, len(features.columns))
rmse_value = rmse(test_labels, best_predictions)
mape_value = mean_absolute_percentage_error(test_labels, best_predictions)

# Display evaluation metrics
print("Best Model Hyperparameters:", best_params)
print("Adjusted R squared:", adj_r_squared)
print("RMSE:", rmse_value)
print("Mean Absolute Percentage Error:", mape_value)

NameError: name 'df' is not defined

Machine learning model is being trained using 5-fold cross-validation with 144 different sets of hyperparameters. This means that the data is split into 5 folds, and the model is trained and evaluated 5 times, each time using a different fold as the test set and the remaining folds as the training set. This process is repeated for each of the 144 sets of hyperparameters, resulting in a total of 720 training and evaluation runs.