In [48]:
import os
import pickle
from pathlib import Path

import pandas as pd
from sklearn.base import clone
from sklearn.datasets import fetch_20newsgroups
from sklearn.decomposition import TruncatedSVD
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC

In [49]:
# Load the data
#
# Both CSV files live in one directory. It defaults to the original local
# folder and can be redirected by setting ENGLISH_SCORE_DATA_DIR, so the
# notebook is not tied to a single machine.
DATA_DIR = Path(os.environ.get("ENGLISH_SCORE_DATA_DIR", r"C:\Users\Admin\Desktop\datasets"))

df_subtitles = pd.read_csv(DATA_DIR / "EDA_movies_subtitles.csv")
df_word_level = pd.read_csv(DATA_DIR / "EDA_word_level.csv")

In [50]:
# Print the column names, dtypes and non-null counts of the subtitles table.
df_subtitles.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 347 entries, 0 to 346
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   Movie      347 non-null    object
 1   Level      347 non-null    object
 2   Subtitles  347 non-null    object
dtypes: object(3)
memory usage: 8.3+ KB


In [51]:
# Print the column names, dtypes and non-null counts of the word/level table.
df_word_level.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4943 entries, 0 to 4942
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   word    4943 non-null   object
 1   level   4943 non-null   object
dtypes: object(2)
memory usage: 77.4+ KB


In [52]:
# Normalize the level labels to lower case and join the two tables on them
#
# `Level` in the subtitles table is replaced by a lower-cased `level` column so
# that it shares a key name with the word list. The guard keeps the cell
# re-runnable: once `Level` has been dropped there is nothing left to convert.
if "Level" in df_subtitles.columns:
    df_subtitles = (
        df_subtitles
        .assign(level=df_subtitles["Level"].str.lower())
        .drop(columns="Level")
    )

df_word_level = df_word_level.assign(level=df_word_level["level"].str.lower())

# Inner join on "level".
# NOTE(review): `level` does not look unique in either table, so this is a
# many-to-many join: every movie row is paired with every word of the same
# level, and each word is repeated once per same-level movie. Confirm this
# duplication is intended before df_merged is split into train and test sets,
# because identical (word, level) rows can then land on both sides.
df_merged = pd.merge(df_subtitles, df_word_level, on="level", how="inner")

In [53]:
# Display one randomly drawn row of the merged frame as a quick sanity check.
df_merged.sample()

Unnamed: 0,Movie,Subtitles,level,word
178790,The.Hollow.S01E01.720p.WEB.x264-EDHD,"1 00:00:47,130 --> 00:00:47,965 Huh? 2 00:01:1...",a2,manage


In [54]:
# List the distinct level labels that survived the merge.
df_merged['level'].unique()

array(['b1', 'b2', 'a2', 'c1', 'a1'], dtype=object)

In [55]:
# Features are the words; the target is the level label attached to each word.
X, y = df_merged['word'], df_merged['level']

In [56]:
# Vectorize the text
#
# Fit TF-IDF on the raw word column and bind the resulting matrix to X.
# Reading from df_merged rather than from X keeps the cell re-runnable: the
# previous `X = vectorizer.fit_transform(X)` fed the already-vectorized matrix
# back into the vectorizer on a second run.
# NOTE(review): the vectorizer is fitted on all rows, i.e. before the
# train/test split performed in the following cell.
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(df_merged['word'])

In [57]:
# Split the data into training and test sets: 20% of the rows are held out,
# and the fixed seed keeps the partition repeatable between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [58]:
# Reduce the TF-IDF features to 50 components with TruncatedSVD.
# The training loop in the next cell reads X_train_svd and X_test_svd, so this
# step has to run; while it was commented out, a fresh kernel hit a NameError.
# The decomposition is fitted on the training split only and then applied to
# the test split.
svd = TruncatedSVD(n_components=50, random_state=42)
X_train_svd = svd.fit_transform(X_train)
X_test_svd = svd.transform(X_test)

In [59]:
# Train and evaluate the models
#
# Each candidate is scored twice: on the full TF-IDF features and on the
# TruncatedSVD projection (X_train_svd / X_test_svd).
# NOTE(review): GaussianNB was disabled by the author, presumably because it
# does not accept the sparse TF-IDF matrix; confirm before re-enabling.
models = {
    'Logistic Regression': LogisticRegression(max_iter=1000),
    # 'Naive Bayes': GaussianNB(),
    'SVM': SVC(),
    # Seeded so the forest, and therefore its reported accuracy, is repeatable.
    'Random Forest': RandomForestClassifier(random_state=42),
}

for name, model in models.items():
    print(f'Model: {name}')
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    print(f'Accuracy (without dimensionality reduction): {accuracy}')

    # Fit an unfitted copy on the reduced features. Refitting `model` itself
    # would overwrite the full-feature fit above and leave `models` holding
    # estimators that can no longer predict on X_test.
    svd_model = clone(model).fit(X_train_svd, y_train)
    y_pred_svd = svd_model.predict(X_test_svd)
    accuracy_svd = accuracy_score(y_test, y_pred_svd)
    print(f'Accuracy (with TruncatedSVD): {accuracy_svd}')

    print()

Model: Logistic Regression
Accuracy (without dimensionality reduction): 0.9997832487401334
Accuracy (with TruncatedSVD): 0.3316474902010368

Model: SVM
Accuracy (without dimensionality reduction): 0.9999096869750556
Accuracy (with TruncatedSVD): 0.809060202662428

Model: Random Forest
Accuracy (without dimensionality reduction): 0.9999096869750556
Accuracy (with TruncatedSVD): 0.9999096869750556



In [60]:
# Model evaluation
#
# Refit every candidate on the full TF-IDF features and print its accuracy on
# the held-out test split.
for name in models:
    model = models[name]
    print(f'Model: {name}')
    y_pred = model.fit(X_train, y_train).predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    print(f'Accuracy: {accuracy}\n')

Model: Logistic Regression
Accuracy: 0.9997832487401334

Model: SVM
Accuracy: 0.9999096869750556

Model: Random Forest
Accuracy: 0.9999096869750556



In [65]:
# Pick the model with the highest test accuracy and persist it with pickle.
# On a tie, max() keeps the first such entry in `models`.
best_model_name, best_model = max(
    models.items(),
    key=lambda item: accuracy_score(y_test, item[1].predict(X_test)),
)

# The context manager closes the file even if pickling raises.
# NOTE(review): only the estimator is written; the fitted TF-IDF `vectorizer`
# that produced its input features is not saved here, so whoever loads this
# file needs that vectorizer from elsewhere to score raw text.
with open('C:/Users/Admin/Desktop/DS studies/Data/English_score/English_score_all_files/best_model.pkl', 'wb') as model_dump:
    pickle.dump(best_model, model_dump)