In [1]:
import numpy as np
import pandas as pd
import scipy as sp
from scipy import sparse
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_score
from sklearn.model_selection import train_test_split

In [2]:
# Load the preprocessed e-mail table and discard every row that has a missing
# value in any column, then display the result.
df = pd.read_csv('Task_1_prepprocessed.csv').dropna()
df

Unnamed: 0,class,date,from,to,subject,body
0,0,4,info@global-change.com,michelle.lokay@enron.com,next wave energi trade,energi industri profession global chang associ...
1,0,1,info@pmaconference.com,michelle.lokay@enron.com,regist next txu capac auction,regist next txu energi capac auction new regis...
2,0,6,info@pmaconference.com,michelle.lokay@enron.com,merchant power monthli free sampl,merchant power monthli month s issu almost mw ...
3,0,3,bruno@firstconf.com,energynews@fc.ease.lsoft.com,eyeforenergi updat,welcom week s eyeforenergi updat refresh memor...
4,0,1,deanrogers@energyclasses.com,michelle.lokay@enron.com,deriv earli bird til march houston,deriv energi profession two full day april ear...
...,...,...,...,...,...,...
30687,1,3,jacob rzucidlo <lavoneaker@stalag13.com>,johnny wynott <varou@iit.demokritos.gr>,cpu pain m edicati n ship d r,arrghh west amnstv amlsmith basu petrom qureai...
30688,1,5,hal leake <annettgaskell@buglover.net>,renato mooney <sigletos@iit.demokritos.gr>,dn troubl f r ee,dn troubl f r ee angiospasma zekauskasa anarti...
30689,1,2,dr collins khumalo <khumalo_20@sunumail.sn>,khumalo_20@sunumail.sn,dr collin khumalo,dr collin khumalo attn mr presid dr collin khu...
30690,1,6,Customer Support <support@citibank.com>,Paliourg <paliourg@iit.demokritos.gr>,dear custom detail compromis,dear custom detail compromis dear custom recen...


In [10]:
# Baseline split shared by the body-only models: 25% of the rows are held out,
# and the fixed seed makes the partition reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    df.iloc[:, 5],  # feature: body text (column position 5)
    df.iloc[:, 0],  # target: class label (column position 0)
    test_size=0.25,
    random_state=100,
)

## Конкатенация колонок subject и body

In [6]:
# Work on a deep copy so `df` keeps the untouched body text used by the
# body-only models.
df_copy = df.copy(deep=True)

# Prepend the subject (column position 4) to the body (column position 5) in a
# single positional pass instead of per-cell `.iat` reads and writes.
pairs = list(zip(df_copy.iloc[:, 4], df_copy.iloc[:, 5]))

# A row can only be merged when both values are real strings. This explicit
# check replaces the former bare `except`, which hid every kind of error.
joinable = [isinstance(subject, str) and isinstance(body, str) for subject, body in pairs]

# The body is left unchanged when the subject is empty or the row is not joinable.
df_copy[df_copy.columns[5]] = [
    subject + ' ' + body if ok and subject != '' else body
    for (subject, body), ok in zip(pairs, joinable)
]

# Report skipped rows once, with a count, rather than one vague line per row.
skipped_rows = joinable.count(False)
if skipped_rows:
    print(f"Subject not prepended for {skipped_rows} row(s): subject or body is not a string")

# Same test_size and random_state as the baseline split; the label halves are
# discarded here. Selected features: weekday (position 1) and the merged text
# (position 5).
x_train_b, x_test_b, _, _ = train_test_split(
    df_copy.iloc[:, [1, 5]],
    df_copy.iloc[:, 0],
    test_size=0.25,
    random_state=100,
)

x_train_b

Unnamed: 0,date,body
216,4,fw california capac report week transwestern s...
11901,0,child center tour thank thank much take time b...
21659,2,book friend talk get danger book world book a ...
2775,2,mobil beaumont mar rebecca mobil march begin m...
8057,0,fyi discu neg ctc cpuc cdwr cent surcharg pass...
...,...,...
12272,3,notic probat would like pa sharad s probationa...
25015,0,healthi live everyday life improv sexual life ...
14335,0,sevil yamen ann thank vinc ann labbe enron enr...
23872,1,offic softwar wholesal price get popular softw...


# Метод 1

Базовый алгоритм, использующий в качестве признака только body

In [11]:
# TF-IDF tokenization/weighting of the body text with default settings.
vectorizer_a = TfidfVectorizer()

# The vectorizer is fitted on the training split only; the held-out split is
# transformed with that fitted vectorizer.
x_train_a, x_test_a = vectorizer_a.fit_transform(x_train), vectorizer_a.transform(x_test)

In [12]:
# Random-forest classifier: 10 trees, fixed seed, up to 10 parallel jobs.
rfc_a = RandomForestClassifier(
    n_estimators=10,
    n_jobs=10,
    random_state=2000,
).fit(x_train_a, y_train)

# Evaluate on the held-out split and print per-class precision/recall/F1.
y_pred_a = rfc_a.predict(x_test_a)
print(
    classification_report(y_test, y_pred_a, digits=3)
)

              precision    recall  f1-score   support

           0      0.947     0.989     0.968      3962
           1      0.987     0.938     0.962      3511

    accuracy                          0.965      7473
   macro avg      0.967     0.964     0.965      7473
weighted avg      0.966     0.965     0.965      7473



# Метод 2

Признаки: body, subject, день недели

In [13]:
vectorizer_b = TfidfVectorizer()

# Append the weekday values as one extra column to the sparse TF-IDF matrix.
# Column position 1 of the split frame is the merged subject+body text and
# position 0 is the weekday; `iloc[:, [0]]` keeps the weekday as an (n, 1) block.
# CSR is requested explicitly instead of relying on hstack's default output format.
# NOTE: this cell rebinds x_train_b / x_test_b from DataFrames to sparse
# matrices, so it cannot be re-run without first re-running the cell that
# creates the split.
x_train_b = sparse.hstack(
    (
        vectorizer_b.fit_transform(x_train_b.iloc[:, 1]),
        x_train_b.iloc[:, [0]].to_numpy(),
    ),
    format='csr',
)
x_test_b = sparse.hstack(
    (
        vectorizer_b.transform(x_test_b.iloc[:, 1]),
        x_test_b.iloc[:, [0]].to_numpy(),
    ),
    format='csr',
)

In [14]:
# Same forest configuration as the baseline so only the features differ.
# `y_train` / `y_test` come from the baseline split; the subject+body split
# used the same test_size and random_state on a copy of the same frame.
rfc_b = RandomForestClassifier(
    n_estimators=10,
    n_jobs=10,
    random_state=2000,
).fit(x_train_b, y_train)

y_pred_b = rfc_b.predict(x_test_b)
print(
    classification_report(y_test, y_pred_b, digits=3)
)

              precision    recall  f1-score   support

           0      0.957     0.987     0.972      3962
           1      0.985     0.950     0.967      3511

    accuracy                          0.970      7473
   macro avg      0.971     0.969     0.970      7473
weighted avg      0.970     0.970     0.970      7473



# Метод 3

Признак: только body, векторизация биграммами — `ngram_range=(2, 2)`

In [15]:
# Bigram-only TF-IDF over the body text: ngram_range=(2, 2) keeps 2-token
# n-grams and no single tokens.
vectorizer_c = TfidfVectorizer(ngram_range=(2, 2))
x_train_c, x_test_c = vectorizer_c.fit_transform(x_train), vectorizer_c.transform(x_test)

# Same forest configuration as the other methods so only the features differ.
rfc_c = RandomForestClassifier(
    n_estimators=10,
    n_jobs=10,
    random_state=2000,
).fit(x_train_c, y_train)

y_pred_c = rfc_c.predict(x_test_c)
print(
    classification_report(y_test, y_pred_c, digits=3)
)

              precision    recall  f1-score   support

           0      0.968     0.959     0.964      3962
           1      0.954     0.964     0.959      3511

    accuracy                          0.962      7473
   macro avg      0.961     0.962     0.961      7473
weighted avg      0.962     0.962     0.962      7473

