# Проект для «Викишоп»

Цель — обучить модель классифицировать комментарии на позитивные и негативные (токсичные).
Требуется построить модель со значением метрики качества *F1* не меньше 0.75.


## Подготовка

In [None]:
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords as nltk_stopwords
from sklearn.feature_extraction.text import TfidfVectorizer 
from nltk.stem import WordNetLemmatizer 
import re
from sklearn.model_selection import train_test_split
from nltk.corpus import stopwords as nltk_stopwords
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Load the comments dataset from CSV and preview its first rows.
data = pd.read_csv('/datasets/toxic_comments.csv')
data.head()

Unnamed: 0,text,toxic
0,Explanation\nWhy the edits made under my usern...,0
1,D'aww! He matches this background colour I'm s...,0
2,"Hey man, I'm really not trying to edit war. It...",0
3,"""\nMore\nI can't make any real suggestions on ...",0
4,"You, sir, are my hero. Any chance you remember...",0


In [None]:
# Print the column dtypes and non-null counts of the loaded frame.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 159571 entries, 0 to 159570
Data columns (total 2 columns):
 #   Column  Non-Null Count   Dtype 
---  ------  --------------   ----- 
 0   text    159571 non-null  object
 1   toxic   159571 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 2.4+ MB


In [None]:
m = WordNetLemmatizer()


def lemmatize_text(text):
    """Tokenize ``text`` with NLTK and return its lemmatized tokens joined by spaces.

    Every token produced by ``nltk.word_tokenize`` is passed to the shared
    ``WordNetLemmatizer`` instance ``m`` using its default part of speech.
    """
    tokens = nltk.word_tokenize(text)
    lemmas = map(m.lemmatize, tokens)
    return ' '.join(lemmas)

def clear_text(text):
    """Keep only ASCII letters and apostrophes in ``text``.

    Every other character is replaced by a space; runs of whitespace are then
    collapsed to single spaces and leading/trailing whitespace is removed.
    """
    letters_only = re.sub(r"[^a-zA-Z']", ' ', text)
    return " ".join(letters_only.split())


In [None]:
# Clean the raw comments in place, then store their lemmatized form in a new
# column; the lemmatizer therefore runs on the already cleaned text.
data['text'] = data['text'].apply(clear_text)
data['lemm_text'] = data['text'].apply(lemmatize_text)
data.head()

Unnamed: 0,text,toxic,lemm_text
0,Explanation Why the edits made under my userna...,0,Explanation Why the edits made under my userna...
1,D'aww He matches this background colour I'm se...,0,D'aww He match this background colour I 'm see...
2,Hey man I'm really not trying to edit war It's...,0,Hey man I 'm really not trying to edit war It ...
3,More I can't make any real suggestions on impr...,0,More I ca n't make any real suggestion on impr...
4,You sir are my hero Any chance you remember wh...,0,You sir are my hero Any chance you remember wh...


In [None]:
# Drop the cleaned `text` column; `lemm_text` is what the model features are built from.
data = data.drop(['text'], axis=1)

In [None]:
# Model input is the lemmatized comment text; the label is the `toxic` column.
features = data['lemm_text'].values
target = data['toxic'].values

In [None]:
# Hold out 20% of the rows as a test set; the fixed random_state makes the split repeatable.
features_train, features_test, target_train, target_test = train_test_split(features, target, test_size=0.2, random_state=12345)

In [None]:
# English stop words from NLTK, kept as a set.
stopwords = set(nltk_stopwords.words('english'))

# TfidfVectorizer documents `stop_words` as 'english', a list, or None, and
# newer scikit-learn releases validate the parameter and reject a set, so the
# words are handed over as a (sorted, hence deterministic) list.
count_tf_idf = TfidfVectorizer(stop_words=sorted(stopwords))

In [None]:
# Fit the TF-IDF vocabulary and weights on the training texts only, then apply
# the fitted vectorizer to the test texts.
features_train = count_tf_idf.fit_transform(features_train)
features_test = count_tf_idf.transform(features_test)

## Обучение

In [None]:
def model_fit_predict(model, features_train=features_train, target_train=target_train,
                      target_test=target_test, features_test=features_test):
    """Fit ``model`` on the training data and report its F1 score on the test data.

    The default arguments are bound when this function is defined, so they
    refer to the train/test objects that existed at that moment. Re-run this
    definition, or pass the data explicitly, after those objects change.

    Args:
        model: Estimator exposing ``fit(X, y)`` and ``predict(X)``.
        features_train: Training feature matrix passed to ``model.fit``.
        target_train: Training labels passed to ``model.fit``.
        target_test: True labels the predictions are compared against.
        features_test: Feature matrix passed to ``model.predict``.

    Returns:
        The value returned by ``f1_score(target_test, predictions)``. The same
        value is also printed, prefixed with ``"F score: "``.
    """
    model.fit(features_train, target_train)
    predictions = model.predict(features_test)
    score = f1_score(target_test, predictions)
    print("F score: ", score)
    # Return the score instead of print()'s None so callers can store or compare it.
    return score

In [None]:
# Sanity check: shapes of the vectorized feature matrices and of the label arrays.
print(features_train.shape, features_test.shape, target_train.shape, target_test.shape)

(127656, 144122) (31915, 144122) (127656,) (31915,)


##### LogisticRegression

In [None]:
# Baseline: logistic regression with its default constructor arguments.
model = LogisticRegression()

model_fit_predict(model)

F score:  0.7339791356184798


##### RandomForest

In [None]:
# Random forest with default hyperparameters. The seed is fixed to the same
# random_state=12345 used for the train/test split and the tuned logistic
# regression, so the reported score is reproducible between runs.
model = RandomForestClassifier(random_state=12345)

model_fit_predict(model)

F score:  0.6999615827890895


Попробуем улучшить качество LogisticRegression, подобрав гиперпараметры перебором по сетке (GridSearchCV).

In [None]:
# Grid-search the penalty type and the C value of a liblinear logistic
# regression, scoring every candidate by F1 with 3-fold cross-validation.
lr_model = LogisticRegression(random_state=12345, solver='liblinear', max_iter=100)
params = {
    'penalty': ['l1', 'l2'],
    'C': list(range(1, 15, 3)),
}

lr_grid = GridSearchCV(lr_model, params, cv=3, scoring='f1', verbose=True)
lr_grid.fit(features_train, target_train)

print("Best Params", lr_grid.best_params_)
print("Best Score", lr_grid.best_score_)

Fitting 3 folds for each of 10 candidates, totalling 30 fits
Best Params {'C': 4, 'penalty': 'l1'}
Best Score 0.7735164543698501


In [None]:
# Refit logistic regression with the parameters hard-coded from the grid-search
# output (C=4, penalty='l1') and score it on the held-out test set.
lr_best_param = LogisticRegression(random_state=12345, C = 4, penalty = 'l1', solver='liblinear', max_iter=100)

model_fit_predict(lr_best_param)


F score:  0.7595838677497506


## Выводы

Лучший результат показала модель LogisticRegression. После подбора гиперпараметров (C=4, penalty='l1') среднее значение F1 на кросс-валидации составило 0.7735,
а на тестовой выборке — 0.7596, что выше требуемого порога 0.75.