In [1]:
import os 

import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression

from sklearn.metrics import accuracy_score



In [2]:
arc_path = "C:\\Programming\\univer\\NPL\\Lab2\\aclImdb"

pos_train_path = arc_path + "\\train\\pos"
neg_train_path = arc_path + "\\train\\neg"

pos_file_names = os.listdir(pos_train_path)
neg_file_names = os.listdir(neg_train_path)

In [3]:
bound = 30000

error_list = []

pos_list = []
for index, pos_file_name in enumerate(pos_file_names):
    with open(f"{pos_train_path}\\{pos_file_name}", "r") as file:
        try:
            pos_list.append(file.read())
        except:
            error_list.append(f"pos: {pos_file_name}")
    if index + 1 >= bound:
        break

neg_list = []
for index, neg_file_name in enumerate(neg_file_names):
    with open(f"{neg_train_path}\\{neg_file_name}", "r") as file:
        try:
            neg_list.append(file.read())
        except:
            error_list.append(f"neg: {neg_file_name}")
    if index + 1 >= bound:
        break

In [4]:
min_len = min(len(pos_list), len(neg_list))
df = pd.DataFrame.from_dict({"pos" : pos_list[:min_len], "neg": neg_list[:min_len]})

print("Head:")
print(df.head(3))

print("\nTail")
print(df.tail(3))

Head:
                                                 pos  \
0  Bromwell High is a cartoon comedy. It ran at t...   
1  Homelessness (or Houselessness as George Carli...   
2  Brilliant over-acting by Lesley Ann Warren. Be...   

                                                 neg  
0  Story of a man who has unnatural feelings for ...  
1  Airport '77 starts as a brand new luxury 747 p...  
2  This film lacked something I couldn't put my f...  

Tail
                                                     pos  \
12494  I am amazed at how this movie(and most others ...   
12495  A Christmas Together actually came before my t...   
12496  Working-class romantic drama from director Mar...   

                                                     neg  
12494  I saw 'Descent' last night at the Stockholm Fi...  
12495  Some films that you pick up for a pound turn o...  
12496  This is one of the dumbest films, I've ever se...  


In [5]:
shuffled_df = df.sample(frac=1)

print("Shuffled")
print("Head:")
print(shuffled_df.head(3))

print("\nTail")
print(shuffled_df.tail(3))

Shuffled
Head:
                                                     pos  \
12266  I thought the film could be a bit more complex...   
3025   He's stocky, sweaty, slightly cross-eyed and r...   
1088   Hey now, yours truly, TheatreX, found this whi...   

                                                     neg  
12266  I really do not have any clue as to why some p...  
3025   A thematic staple of cinema since its inceptio...  
1088   A truly frightening film. Feels as if it were ...  

Tail
                                                    pos  \
6046  Samuel Fuller brings his customary playful and...   
1160  French horror cinema has seen something of a r...   
7176  Its a shame she didn't get screen credit , she...   

                                                    neg  
6046  Although the actors were good, specially Fritz...  
1160  Shown in Australia as 'Hydrosphere', this incr...  
7176  First let me be honest. I did not watch all th...  


In [6]:
pos_docs = df['pos'].tolist() 
neg_docs = df['neg'].tolist()

docs = pos_docs + neg_docs
labels = [1] * len(pos_docs) + [0] * len(neg_docs)

In [7]:
vectorizer = TfidfVectorizer()
tf_idf_matrix = vectorizer.fit_transform(docs)

X_train, X_test, y_train, y_test = train_test_split(tf_idf_matrix, labels, test_size=0.25, random_state=42)

classifier = LogisticRegression(max_iter=1000)
classifier.fit(X_train, y_train)

y_pred = classifier.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy}')

cv_scores = cross_val_score(classifier, tf_idf_matrix, labels, cv=5)
print(f'Cross-validation scores: {cv_scores}')
print(f'Mean cross-validation score: {cv_scores.mean()}')


Accuracy: 0.885741718674988
Cross-validation scores: [0.87057411 0.86017203 0.85837167 0.85837167 0.8707483 ]
Mean cross-validation score: 0.8636475594438615


In [9]:
def predict_sentiment(text, classifier):
    text_tf_idf = vectorizer.transform([text])
    prediction = classifier.predict(text_tf_idf)
    return "Positive" if prediction[0] == 1 else "Negative"

new_texts = [
    "This movie was fantastic and I really enjoyed it!",
    "I didn't like this film at all. It was boring.",
    "An absolute masterpiece, highly recommend!",
    "The plot was predictable and the acting was poor.", 
]

for text in new_texts:
    sentiment = predict_sentiment(text, classifier)
    print(f'The sentiment of the provided text "{text}" is: {sentiment}')

The sentiment of the provided text "This movie was fantastic and I really enjoyed it!" is: Positive
The sentiment of the provided text "I didn't like this film at all. It was boring." is: Negative
The sentiment of the provided text "An absolute masterpiece, highly recommend!" is: Positive
The sentiment of the provided text "The plot was predictable and the acting was poor." is: Negative
