###Считваение файлов и создание Датафрейма

In [1]:
import pandas as pd
import numpy as np
import os


In [2]:
#подключение гугла
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
dataset_path = '/content/drive/My Drive/aclImdb_v1.tar.gz'


Считаем файлы при помощи tarfile

In [4]:
import tarfile
if not os.path.exists('/content/aclImdb/'):
  with tarfile.open(dataset_path, 'r:gz') as tar:
    tar.extractall('/content/')

In [5]:
def load_data(data_dir):
  data = {'text_rev':[], 'sentiment':[], 'estimation':[]}
  for sentiment in ['pos', 'neg']:
    sentiment_dir = os.path.join(data_dir, sentiment)
    for filename in os.listdir(sentiment_dir): #проходка по директории
      if filename.endswith('.txt'):
        with open(os.path.join(sentiment_dir, filename), 'r', encoding='utf-8') as f: #кодировка на всякий случай
          data['text_rev'].append(f.read())
          data['sentiment'].append(1 if sentiment == 'pos' else 0)
          estimate = int((filename.split('_')[1]).split(".")[0]) #разделим оценку
          data['estimation'].append(estimate)
  return pd.DataFrame(data)


In [6]:
data_dir_train = '/content/aclImdb/train'
data_dir_test = '/content/aclImdb/test'
train_df = load_data(data_dir_train)
test_df = load_data(data_dir_test)

In [None]:
train_df

Unnamed: 0,text_rev,sentiment,estimation
0,Erotic cinema of the 1970's was tame compared ...,1,7
1,"As a kid, I loved computer animation although ...",1,10
2,"After all these years, of Peter O'Tool's brill...",1,10
3,"First, this was a BRAVE film. I've seen Irreve...",1,7
4,Will all of you please lay the hell off Todd S...,1,9
...,...,...,...
24995,Absolutely the most boring movie I have ever s...,0,1
24996,This movie was probably the biggest waste of m...,0,1
24997,This should have been a movie about Sam and hi...,0,4
24998,As myself and my other half are big fans of tr...,0,1


In [9]:
example = train_df[train_df["estimation"] == 1]["text_rev"].iloc[0]
example

'After finally viewing this movie in its entirety, I am completely mystified by the adoration it has received by critics and online users alike. Is it the worst Western ever? No, I wouldn\'t say that. But "the last great American Western", a phrase I saw applied to it more than once? Not even close.<br /><br />A movie that tries to tell a story like this needs believable characters that speak believable dialogue, and the dialogue in this film is among the most hackneyed and clichéd that I\'ve ever seen. The movie can be measured in groaners per minute; as in, how many times is an actor or actress forced by the script to say something that no human being would say in real life? There\'s so many instances of this that it\'s distracting. Cheesy lines come at you in waves; predictable, unoriginal, and often. <br /><br />If bad dialogue doesn\'t bother you, then how about bad gunfights? Few Westerns can show you gunfighting that\'s completely unbelievable while desperately trying to make yo

Отзыв отрицательный, обработано верно переходим к обработке данных

In [None]:
train_df["estimation"].value_counts()

Unnamed: 0_level_0,count
estimation,Unnamed: 1_level_1
1,5100
10,4732
8,3009
4,2696
7,2496
3,2420
2,2284
9,2263


###Подготовка данных(токенизация)

In [7]:
import nltk
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import SnowballStemmer
import string
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [None]:
my_list = string.punctuation
my_list

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [10]:
#Разобъем на таконы
my_list = string.punctuation
cleaned_text = re.sub(r"[^\w\s]", "", example)
string_to_token = word_tokenize(cleaned_text, language="english")
token_without_punkt = [i for i in string_to_token if i not in string.punctuation]
token_withut_punkt_and_stop_word = [i for i in token_without_punkt if i not in stopwords.words("english")]
stemer = SnowballStemmer(language="english")
result = [stemer.stem(i) for i in token_withut_punkt_and_stop_word]
result[:5]

['after', 'final', 'view', 'movi', 'entireti']

In [11]:
def preprocess_text(text: str, language: str = "english") -> list:
    cleaned_text = re.sub(r"[^\w\s]", "", text)

    tokens = word_tokenize(cleaned_text, language=language)

    stop_words = set(stopwords.words(language))
    tokens_without_stopwords = [token for token in tokens if token.lower() not in stop_words]

    stemmer = SnowballStemmer(language=language)
    stemmed_tokens = [stemmer.stem(token) for token in tokens_without_stopwords]

    return stemmed_tokens

###Обучение модели

In [12]:
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression


In [13]:
train_df

Unnamed: 0,text_rev,sentiment,estimation
0,Tressa's vocal performance was Outstanding!! T...,1,10
1,"Well, when before I saw this film I really was...",1,10
2,"I remember this movie from when i was 12, it w...",1,10
3,"This is one of the best reunion specials ever,...",1,10
4,This made for television version of the legend...,1,7
...,...,...,...
24995,"The only reason ""The Norliss Tapes"" deserves A...",0,1
24996,I haven't seen it in over twenty years. OJ was...,0,1
24997,"...was so that I could, in good conscience, te...",0,3
24998,This movie kinda let me down. It seemed a lot ...,0,4


In [14]:
Tfvector = TfidfVectorizer(tokenizer= lambda x: preprocess_text(x, language='english'))

In [16]:
features = Tfvector.fit_transform(train_df["text_rev"])

In [None]:
model = LogisticRegression(random_state=42)
model.fit(features, train_df["estimation"])


In [None]:
from sklearn.metrics import classification_report

In [None]:
test_data = Tfvector.transform(test_df["text_rev"])
y_pred = model.predict(test_data)
print(classification_report(test_df["estimation"], y_pred))

              precision    recall  f1-score   support

           1       0.53      0.82      0.64      5022
           2       0.28      0.08      0.13      2302
           3       0.32      0.16      0.22      2541
           4       0.35      0.35      0.35      2635
           7       0.36      0.27      0.31      2307
           8       0.30      0.26      0.28      2850
           9       0.31      0.08      0.13      2344
          10       0.49      0.76      0.60      4999

    accuracy                           0.44     25000
   macro avg       0.37      0.35      0.33     25000
weighted avg       0.40      0.44      0.39     25000



In [None]:
#Порпробуем решить задачу градиентым бустингом

In [None]:
import xgboost as xgb
from sklearn.preprocessing import LabelEncoder
Tfvector = TfidfVectorizer(tokenizer=lambda x: preprocess_text(x, language='english'))
features = Tfvector.fit_transform(train_df["text_rev"])
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(train_df["estimation"])
X_train, y_train = features, y_encoded
# Инициализация модели XGBoost
model = xgb.XGBClassifier(
    objective='multi:softmax',
    num_class=8,
    eval_metric='mlogloss',
    use_label_encoder=False)

param_grid = {
    'learning_rate': [0.01, 0.05, 0.1],
    'n_estimators': [100, 500, 1000],
    'max_depth': [3, 5, 7]
}


grid_search = GridSearchCV(estimator=model, param_grid=param_grid,
                           scoring='neg_log_loss', cv=3, verbose=2, n_jobs=-1)

grid_search.fit(X_train, y_train)
best_params = grid_search.best_params_
print(f"Best Parameters: {best_params}")


best_model = grid_search.best_estimator_
y_pred = best_model.predict(test_data)
y_encoded_test = label_encoder.transform(test_df["estimation"])
print(classification_report(y_encoded_test, y_pred))




Fitting 3 folds for each of 27 candidates, totalling 81 fits


KeyboardInterrupt: 

Модель очень сложная и бесплантой версии коллаб не хватило ее обучить(пробовал даже без grid_search)

In [17]:
combined_df = pd.concat([train_df, test_df], ignore_index=True)
combined_df

Unnamed: 0,text_rev,sentiment,estimation
0,Tressa's vocal performance was Outstanding!! T...,1,10
1,"Well, when before I saw this film I really was...",1,10
2,"I remember this movie from when i was 12, it w...",1,10
3,"This is one of the best reunion specials ever,...",1,10
4,This made for television version of the legend...,1,7
...,...,...,...
49995,Jeff Speakman never really made it beyond the ...,0,4
49996,What a terrible movie! It represents perfectly...,0,1
49997,I only wish there was a grade lower than F to ...,0,1
49998,"This movie is a real low budget production, ye...",0,3


In [None]:
pip install --upgrade xgboost scikit-learn



In [None]:

model = xgb.XGBClassifier(
    random_state=42,
    learning_rate=0.1,
    n_estimators=1000,
    objective='multi:softmax',
    num_class=10,
    eval_metric='mlogloss',
    use_label_encoder=False
)


model.fit(
    features_train, y_encoded_train,
    verbose=2
)


y_pred = model.predict(features_test)


print(classification_report(y_encoded_test, y_pred))


Parameters: { "use_label_encoder" } are not used.



Для модели выше нехватило времени ресурсы коллаба заканчиваются

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
X_train, X_test, y_train, y_test = train_test_split(combined_df["text_rev"], combined_df["estimation"], test_size=0.2, random_state=42)


Tfvector = TfidfVectorizer(tokenizer=lambda x: preprocess_text(x, language='english'))
features_train = Tfvector.fit_transform(X_train)
features_test = Tfvector.transform(X_test)


label_encoder = LabelEncoder()
y_encoded_train = label_encoder.fit_transform(y_train)
y_encoded_test = label_encoder.transform(y_test)

model = LogisticRegression(random_state=42)
param_grid = {'C': [0.1, 1, 10],
              'penalty': ['l1', 'l2'],
              'solver': ['liblinear', 'saga']}
grid_search = GridSearchCV(model, param_grid, cv=5, scoring='accuracy')
grid_search.fit(features_train, y_encoded_train)


y_pred = grid_search.best_estimator_.predict(features_test)
print(classification_report(y_encoded_test, y_pred))



              precision    recall  f1-score   support

           0       0.55      0.82      0.66      2007
           1       0.35      0.10      0.16       880
           2       0.39      0.21      0.27       984
           3       0.41      0.36      0.39      1074
           4       0.38      0.30      0.33       954
           5       0.34      0.26      0.30      1193
           6       0.39      0.13      0.19       909
           7       0.49      0.80      0.60      1999

    accuracy                           0.46     10000
   macro avg       0.41      0.37      0.36     10000
weighted avg       0.43      0.46      0.42     10000



In [None]:
model = grid_search.best_estimator_


In [None]:
model

In [None]:
from joblib import dump, load
filename = 'trained_model.joblib'
dump(grid_search.best_estimator_, filename)
# Загрузка сохраненной модели для использования
loaded_model = load(filename)


Провекра модели

In [None]:
text_1= '“Don’t waste your time! This movie is terrible. The plot is predictable, the dialogue is lifeless, and the actors act like they’re being forced to. The special effects are cheap, the music is annoying. The only plus is the length, it’s too short to be completely disappointed. I’d rather rewatch an old movie than this “masterpiece”!”'

In [None]:
text_2 = '“This movie is simply amazing! The plot grabs you from the first minute and doesn’t let go until the very end. The acting is top-notch, each character is alive and memorable. The visuals are impressive, and the music perfectly complements the atmosphere of the film. I would recommend this film to anyone who enjoys intelligent films with deep meaning. I will definitely rewatch it!”'

In [None]:
def pred(text):
  text_Tf = Tfvector.transform([text])
  return label_encoder.inverse_transform(model.predict(text_Tf))

In [None]:
print(pred(text_2))

[10]


Посмотрим нзависиит ли длинна тектса от целевой переменной

In [None]:
combined_df["len"] = combined_df["text_rev"].apply(lambda x: len(x))

In [None]:
combined_df.drop("text_rev", axis=1).corr()

Unnamed: 0,sentiment,estimation,len
sentiment,1.0,0.941534,0.015526
estimation,0.941534,1.0,0.01326
len,0.015526,0.01326,1.0


Нет смысла добавлять длинну текста

Сохраним модель

In [19]:
import joblib
import pandas as pd
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import SnowballStemmer
import nltk
from sklearn.preprocessing import LabelEncoder
nltk.download('stopwords', quiet=True)
nltk.download('punkt', quiet=True)

def preprocess_text(text: str) -> str:
    cleaned_text = re.sub(r"[^\w\s]", "", text)
    tokens = word_tokenize(cleaned_text)
    stop_words = set(stopwords.words("english"))
    tokens_without_stopwords = [token for token in tokens if token.lower() not in stop_words]
    stemmer = SnowballStemmer(language="english")
    stemmed_tokens = [stemmer.stem(token) for token in tokens_without_stopwords]
    return ' '.join(stemmed_tokens)


X_train, X_test, y_train, y_test = train_test_split(combined_df["text_rev"], combined_df["estimation"], test_size=0.2, random_state=42)
Tfvector = TfidfVectorizer()
features_train = Tfvector.fit_transform(X_train.apply(preprocess_text))
features_test = Tfvector.transform(X_test.apply(preprocess_text))
label_encoder = LabelEncoder()
y_encoded_train = label_encoder.fit_transform(y_train)
y_encoded_test = label_encoder.transform(y_test)

model = LogisticRegression(C=1, penalty='l1', random_state=42, solver='liblinear')
model.fit(features_train, y_encoded_train)

# Сохранение модели и объектов
joblib.dump(model, 'model_status.pkl')
joblib.dump(Tfvector, 'tfidf_vectorizer.pkl')
joblib.dump(label_encoder, 'label_encoder.pkl')


['label_encoder.pkl']

In [None]:
y_pred_continuous = model.predict(features_test)
y_pred_labels = label_encoder.inverse_transform(y_pred_continuous)
y_test_labels = label_encoder.inverse_transform(y_encoded_test)

In [None]:
y_pred_labels

array([10, 10,  4, ...,  4,  1, 10])

In [None]:
rew_1 = input()

"The Shawshank Redemption" is a masterpiece that transcends the typical boundaries of storytelling and film-making. Directed by Frank Darabont and based on Stephen King’s novella, this film explores themes of hope, friendship, and resilience within the confines of a bleak prison environment. The story follows Andy Dufresne, portrayed brilliantly by Tim Robbins, a banker wrongfully convicted of murder. Robbins delivers a nuanced performance, capturing Andy’s intelligence, determination, and unyielding spirit. Morgan Freeman's portrayal of Ellis "Red" Redding is equally remarkable; his narration adds depth to the film and provides a profound perspective on life behind bars. The chemistry between Andy and Red is palpable, evolving from strangers to deep friends over the years. Visually, the film is stunning, with cinematography that beautifully captures the harsh realities of prison life juxtaposed with moments of serene beauty. Thomas Newman’s score perfectly complements the emotional la

In [None]:
print(label_encoder.inverse_transform(model.predict(Tfvector.transform([preprocess_text(input())])))[0])

"Journey to the Stars" is a breathtaking cinematic experience that takes you on an unforgettable adventure through the cosmos. From the moment the film begins, you are immersed in a stunning visual spectacle, with beautifully rendered space scenes that leave you in awe. The special effects are nothing short of spectacular, creating a sense of wonder that truly captures the vastness of the universe.  The storyline is both engaging and thought-provoking, following the journey of a diverse crew of astronauts as they explore uncharted worlds. The characters are well-developed, each with their own unique backstories and motivations, which adds depth to the narrative. The performances are exceptional, particularly from the lead actor, who brings a powerful emotional resonance to the role, making you genuinely care about their journey.  The film also masterfully blends action, humor, and poignant moments, creating a perfect balance that keeps you entertained from start to finish. The soundtra

In [None]:
Eternal Boredom" is a cinematic disaster that feels like a punishment to sit through. The plot is a confusing mess with no direction, making it impossible to follow. It’s as if the script was written during a nap, filled with long, tedious scenes that drag on forever.

The acting is equally atrocious. The lead actor delivers every line with the enthusiasm of a sloth on sedatives. The supporting cast seems to be competing for the award of “Most Wooden Performance.” There’s no chemistry between the characters, making every interaction painfully awkward.

Visually, the film is as appealing as a gray wall. The cinematography is uninspired, and the special effects look like they were created in a middle school art class. The soundtrack is just a repetitive drone that adds to the overall feeling of despair.

In short, "Eternal Boredom" is a complete waste of time. It’s hard to believe this film even got made. Save yourself the agony and avoid it at all costs!



In [None]:
"Journey to the Stars" is a breathtaking cinematic experience that takes you on an unforgettable adventure through the cosmos. From the moment the film begins, you are immersed in a stunning visual spectacle, with beautifully rendered space scenes that leave you in awe. The special effects are nothing short of spectacular, creating a sense of wonder that truly captures the vastness of the universe.

The storyline is both engaging and thought-provoking, following the journey of a diverse crew of astronauts as they explore uncharted worlds. The characters are well-developed, each with their own unique backstories and motivations, which adds depth to the narrative. The performances are exceptional, particularly from the lead actor, who brings a powerful emotional resonance to the role, making you genuinely care about their journey.

The film also masterfully blends action, humor, and poignant moments, creating a perfect balance that keeps you entertained from start to finish. The soundtrack is beautifully composed, enhancing the emotional impact of key scenes and perfectly complementing the visuals.

Overall, "Journey to the Stars" is a remarkable film that succeeds in both storytelling and visual artistry. It's a must-see for anyone who loves science fiction or simply enjoys a well-crafted film. This movie will resonate with you long after the credits roll, and it’s easily one of the best films of the year!

In [None]:
"The Forgotten Realm" is a film that leaves you with conflicting emotions. On one hand, it offers a visually stunning adventure that transports you to a richly imagined fantasy world filled with vibrant landscapes and intricate designs. The cinematography is top-notch, with sweeping shots that capture the beauty and majesty of this forgotten realm.  However, while the visuals are a highlight, the storyline feels underwhelming. The plot has potential, revolving around a young hero’s quest to reclaim a lost kingdom, but it often falls into predictable tropes that leave little room for surprises. Some characters are well-developed and engaging, especially the mentor figure who brings wisdom and humor to the journey. Yet, other characters come off as clichéd and lacking depth, which makes it hard to connect with their struggles.  The pacing of the film is another mixed bag. Some scenes drag on unnecessarily, while others feel rushed, especially the climactic battle that could have benefited from more build-up. The dialogue has its moments of cleverness, but it can also be cheesy, pulling you out of the immersive experience.  Overall, "The Forgotten Realm" is a visual feast that showcases the power of fantasy filmmaking, but it struggles with its narrative execution. Fans of the genre may find it enjoyable for its aesthetics, but those looking for a compelling story may walk away feeling a bit disappointed. It’s a film that, while beautiful to watch, could have benefited from a stronger script to match its impressive visuals.


In [None]:
print(classification_report(y_test_labels, y_pred_labels))

              precision    recall  f1-score   support

           1       0.53      0.83      0.65      2018
           2       0.36      0.09      0.14       911
           3       0.37      0.18      0.24       992
           4       0.35      0.34      0.34      1024
           7       0.37      0.30      0.33       943
           8       0.33      0.27      0.30      1188
           9       0.37      0.09      0.15       958
          10       0.48      0.79      0.60      1966

    accuracy                           0.45     10000
   macro avg       0.40      0.36      0.34     10000
weighted avg       0.42      0.45      0.40     10000



In [None]:
Tfvector

In [21]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import classification_report
#Линейная регрессия
linear_model = LinearRegression()
linear_model.fit(features_train, y_encoded_train)
y_pred_continuous = linear_model.predict(features_test)
y_pred_classes = np.rint(y_pred_continuous).astype(int)
y_pred_classes = np.clip(y_pred_classes, 0, len(label_encoder.classes_) - 1)
report = classification_report(y_encoded_test, y_pred_classes)
print(report)

              precision    recall  f1-score   support

           0       0.45      0.40      0.42      1995
           1       0.16      0.16      0.16       909
           2       0.13      0.15      0.14       950
           3       0.16      0.17      0.17      1091
           4       0.11      0.14      0.12       984
           5       0.16      0.16      0.16      1147
           6       0.15      0.15      0.15       970
           7       0.41      0.37      0.39      1954

    accuracy                           0.25     10000
   macro avg       0.22      0.21      0.21     10000
weighted avg       0.26      0.25      0.25     10000



In [None]:
y_encoded_test

array([7, 7, 7, ..., 3, 1, 7])

In [None]:
y_pred_classes

array([7, 7, 2, ..., 3, 0, 2])

In [None]:
report = classification_report(y_encoded_test, y_pred_classes)
print(report)

              precision    recall  f1-score   support

           0       0.46      0.39      0.42      2018
           1       0.18      0.17      0.17       911
           2       0.15      0.16      0.15       992
           3       0.14      0.16      0.15      1024
           4       0.13      0.17      0.14       943
           5       0.17      0.16      0.17      1188
           6       0.16      0.16      0.16       958
           7       0.40      0.36      0.38      1966

    accuracy                           0.25     10000
   macro avg       0.22      0.22      0.22     10000
weighted avg       0.26      0.25      0.25     10000



In [None]:
from sklearn.ensemble import RandomForestClassifier
# Обучение модели случайного леса
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(features_train, y_encoded_train)
y_pred_classes = rf_model.predict(features_test)
y_pred_labels = label_encoder.inverse_transform(y_pred_classes)
y_test_labels = label_encoder.inverse_transform(y_encoded_test)
report = classification_report(y_test_labels, y_pred_labels)
print(report)


              precision    recall  f1-score   support

           1       0.40      0.89      0.55      2018
           2       0.67      0.03      0.05       911
           3       0.51      0.04      0.08       992
           4       0.42      0.10      0.16      1024
           7       0.41      0.08      0.13       943
           8       0.34      0.11      0.17      1188
           9       0.51      0.02      0.04       958
          10       0.37      0.85      0.52      1966

    accuracy                           0.39     10000
   macro avg       0.45      0.27      0.21     10000
weighted avg       0.44      0.39      0.28     10000



Среди всех моделей выше, лучше всего показала лог.регрессия ее и выберем для для работы(она была предварительно сохранена)