In [52]:
import os
import pandas as pd

In [53]:
def load_data(direc, label):
    texts = []
    for i in os.listdir(direc):
        file_path = os.path.join(direc, i)
        with open(file_path, 'r', encoding= 'UTF-8') as j:
            texts.append(j.read())

    return pd.DataFrame({'review': texts, 'sentiment': label})

train_pos_path = "aclImdb/train/pos"
train_neg_path = "aclImdb/train/neg"
test_pos_path = "aclImdb/test/pos"
test_neg_path = "aclImdb/test/neg"

train_pos = load_data(train_pos_path, "positive")
train_neg = load_data(train_neg_path, "negative")
test_pos = load_data(test_pos_path, "positive")
test_neg = load_data(test_neg_path, "negative")

train_set = pd.concat([test_pos, test_neg], ignore_index=True)
test_set = pd.concat([test_pos, test_neg], ignore_index=True)



In [11]:
train_set

Unnamed: 0,review,sentiment
0,I went and saw this movie last night after bei...,positive
1,Actor turned director Bill Paxton follows up h...,positive
2,As a recreational golfer with some knowledge o...,positive
3,"I saw this film in a sneak preview, and it is ...",positive
4,Bill Paxton has taken the true story of the 19...,positive
...,...,...
24995,I occasionally let my kids watch this garbage ...,negative
24996,When all we have anymore is pretty much realit...,negative
24997,The basic genre is a thriller intercut with an...,negative
24998,Four things intrigued me as to this film - fir...,negative


In [10]:
train_set.to_csv('train_set')
test_set.to_csv('test_set')

In [1]:
import pandas as pd

In [2]:
train_set = pd.read_csv('train_set', index_col=0)
test_set = pd.read_csv('test_set', index_col=0)

In [3]:
train_set

Unnamed: 0,review,sentiment
0,I went and saw this movie last night after bei...,positive
1,Actor turned director Bill Paxton follows up h...,positive
2,As a recreational golfer with some knowledge o...,positive
3,"I saw this film in a sneak preview, and it is ...",positive
4,Bill Paxton has taken the true story of the 19...,positive
...,...,...
24995,I occasionally let my kids watch this garbage ...,negative
24996,When all we have anymore is pretty much realit...,negative
24997,The basic genre is a thriller intercut with an...,negative
24998,Four things intrigued me as to this film - fir...,negative


In [4]:
train_set = train_set.sample(frac=1, random_state=42).reset_index(drop=True)
test_set = test_set.sample(frac=1, random_state=42).reset_index(drop=True)
#randomized

In [5]:
train_set

Unnamed: 0,review,sentiment
0,"When I was a kid, I loved ""Tiny Toons"". I espe...",positive
1,"The setup for ""Nature of the Beast"" is ingenio...",negative
2,I do not have much to say than this is a great...,positive
3,Extremely formulaic with cosmic-sized logic ho...,negative
4,I actually liked certain things about this gam...,negative
...,...,...
24995,Start with the premise that you will do anythi...,negative
24996,This movie gives us some WWII history along wi...,positive
24997,In my opinion this is the best Oliver Stone fl...,positive
24998,"It's certainly a direct-to-video, but the stor...",negative


In [6]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

#nltk.download('punkt')
#nltk.download('stopwords')
#nltk.download('wordnet')

In [7]:
def remove_stopwords(text):
    stop_words = set(stopwords.words('english'))  
    word_tokens = word_tokenize(text)  #tokenize
    filtered_text = [i for i in word_tokens if i.lower() not in stop_words]  #dekete the stop words
    return ' '.join(filtered_text)  

train_set['review'] = train_set['review'].apply(remove_stopwords)
test_set['review'] = test_set['review'].apply(remove_stopwords)


In [8]:
train_set

Unnamed: 0,review,sentiment
0,"kid , loved `` Tiny Toons '' . especially love...",positive
1,"setup `` Nature Beast '' ingeniously simple , ...",negative
2,much say great finish story . people said enou...,positive
3,Extremely formulaic cosmic-sized logic holes p...,negative
4,actually liked certain things game . loved fir...,negative
...,...,...
24995,Start premise anything replace lost love look-...,negative
24996,movie gives us WWII history along touching rom...,positive
24997,opinion best Oliver Stone flick -- probably Bo...,positive
24998,"'s certainly direct-to-video , story bad revie...",negative


In [9]:
train_set['review'][0]

"kid , loved `` Tiny Toons '' . especially loved `` Tiny Toons : spent Summer Vacation '' . thought laughs floor funny . years later , friend video . figured 'd watch good old days . still floors laughing . opinion , Plucky Hampton skit best . decide go `` Happy World Land '' . end crazy adventure get . skits funny . 'm still looking video . , anyone tips . Please write . < br / > < br / > one funniest cartoons ever seen. < br / > < br / > 10/10"

In [10]:
import re

def clean_text(text):
    text = re.sub(r'<.*?>', '', text) 
    text = re.sub(r"[^\w\s]", '', text)
    text = text.lower()
    text = re.sub(r'\s+', ' ', text).strip()
    
    return text

train_set['review'] = train_set['review'].apply(clean_text)
test_set['review'] = test_set['review'].apply(clean_text)



In [11]:
train_set['review'][0]

'kid loved tiny toons especially loved tiny toons spent summer vacation thought laughs floor funny years later friend video figured d watch good old days still floors laughing opinion plucky hampton skit best decide go happy world land end crazy adventure get skits funny m still looking video anyone tips please write one funniest cartoons ever seen 1010'

In [12]:
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
lemma = WordNetLemmatizer()

def lemmatize(text):
    tokens = word_tokenize(text)
    lemmatization = [lemma.lemmatize(i) for i in tokens]
    return ' '.join(lemmatization)

train_set['review'] = train_set['review'].apply(lemmatize)
test_set['review'] = test_set['review'].apply(lemmatize)

In [13]:
train_set['review'][0]

'kid loved tiny toon especially loved tiny toon spent summer vacation thought laugh floor funny year later friend video figured d watch good old day still floor laughing opinion plucky hampton skit best decide go happy world land end crazy adventure get skit funny m still looking video anyone tip please write one funniest cartoon ever seen 1010'

In [14]:
y = train_set['sentiment']

In [15]:
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer(stop_words='english', max_features=5000)

X_train_tfidf = vectorizer.fit_transform(train_set['review'])

X_test_tfidf = vectorizer.transform(test_set['review'])


In [16]:
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
X_train, X_valid, y_train, y_valid = train_test_split(X_train_tfidf, y, test_size=0.3, random_state=42)

model = MultinomialNB()   #Naive Bayes classifier
model.fit(X_train, y_train)

In [17]:
from sklearn.metrics import accuracy_score, classification_report
y_pred = model.predict(X_valid)

print(f"Accuracy: {accuracy_score(y_valid, y_pred)}")
print("Classification Report:")
print(classification_report(y_valid, y_pred))


Accuracy: 0.8608
Classification Report:
              precision    recall  f1-score   support

    negative       0.85      0.87      0.86      3745
    positive       0.87      0.85      0.86      3755

    accuracy                           0.86      7500
   macro avg       0.86      0.86      0.86      7500
weighted avg       0.86      0.86      0.86      7500



In [18]:
test_y = test_set['sentiment']

In [19]:
from sklearn.metrics import accuracy_score, classification_report

y_pred_test = model.predict(X_test_tfidf)  

accuracy_test = accuracy_score(test_y, y_pred_test)  
print(f"Accuracy on Test Set: {accuracy_test}")

print("Classification Report on Test Set:")
print(classification_report(test_y, y_pred_test))


Accuracy on Test Set: 0.87424
Classification Report on Test Set:
              precision    recall  f1-score   support

    negative       0.87      0.87      0.87     12500
    positive       0.87      0.87      0.87     12500

    accuracy                           0.87     25000
   macro avg       0.87      0.87      0.87     25000
weighted avg       0.87      0.87      0.87     25000



In [20]:
from sklearn.linear_model import LogisticRegression

lr_model = LogisticRegression(max_iter=1000)
lr_model.fit(X_train_tfidf, train_set['sentiment'])

y_pred_lr = lr_model.predict(X_test_tfidf)

accuracy_lr = accuracy_score(test_y, y_pred_lr)
report_lr = classification_report(test_y, y_pred_lr)

print(f'Accuracy with Logistic Regression: {accuracy_lr}')
print('Classification Report with Logistic Regression:')
print(report_lr)


Accuracy with Logistic Regression: 0.92056
Classification Report with Logistic Regression:
              precision    recall  f1-score   support

    negative       0.93      0.91      0.92     12500
    positive       0.91      0.93      0.92     12500

    accuracy                           0.92     25000
   macro avg       0.92      0.92      0.92     25000
weighted avg       0.92      0.92      0.92     25000



In [21]:
# import pandas as pd
# import torch
# from sklearn.model_selection import train_test_split
# from transformers import BertTokenizer

# df = train_set 

# df['sentiment'] = df['sentiment'].map({'negative': 0, 'positive': 1})

# train_texts, val_texts, train_labels, val_labels = train_test_split(
#     df['review'].tolist(), df['sentiment'].tolist(), test_size=0.2, random_state=42)

# tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# train_encodings = tokenizer(train_texts, truncation=True, padding=True, max_length=512)
# val_encodings = tokenizer(val_texts, truncation=True, padding=True, max_length=512)


In [22]:
# from torch.utils.data import Dataset

# class SentimentDataset(Dataset):
#     def __init__(self, encodings, labels):
#         self.encodings = encodings
#         self.labels = labels

#     def __len__(self):
#         return len(self.labels)

#     def __getitem__(self, idx):
#         item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
#         item["labels"] = torch.tensor(self.labels[idx])
#         return item

# train_dataset = SentimentDataset(train_encodings, train_labels)
# val_dataset = SentimentDataset(val_encodings, val_labels)


In [23]:
# from transformers import BertForSequenceClassification

# model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [24]:
# from transformers import Trainer, TrainingArguments


# training_args = TrainingArguments(
#     output_dir='./results',
#     evaluation_strategy="epoch",
#     save_strategy="epoch",
#     per_device_train_batch_size=8,  
#     per_device_eval_batch_size=8,   
#     num_train_epochs=3,  # Количество эпох
#     weight_decay=0.01,  
#     logging_dir='./logs',
#     logging_steps=100,
#     load_best_model_at_end=True
# )

# # Trainer
# trainer = Trainer(
#     model=model,
#     args=training_args,
#     train_dataset=train_dataset,
#     eval_dataset=val_dataset
# )

# trainer.train()




Epoch,Training Loss,Validation Loss


KeyboardInterrupt: 