In [12]:
import pandas as pd

# Read File Data
with open('positive.txt', 'r', encoding='utf-8') as file:
    positive_data = file.readlines()

with open('negative.txt', 'r', encoding='utf-8') as file:
    negative_data = file.readlines()

# Create dataframe with class (0 - positive, 1 - negative)
df_positive = pd.DataFrame({'text': positive_data, 'class': 0})
df_negative = pd.DataFrame({'text': negative_data, 'class': 1})

# Connect positive and negative dataframes
df = pd.concat([df_positive, df_negative], ignore_index=True)

# Shuffle dataframe
df = df.sample(frac=1).reset_index(drop=True)
print(df.head())

                                                text  class
0  Wspaniała atmosfera. Film sprawia, że czuję si...      0
1  Przesadne naciąganie fabuły. Niektóre wydarzen...      1
2  Wciągająca fabuła. Trudno przewidzieć, co wyda...      0
3      Nieudane efekty specjalne. Widać, że budże...      1
4  Znakomita gra aktorska. Aktorzy wnoszą do swoi...      0


In [13]:
# 1. Zaimportuj wszystkie potrzebne biblioteki

import pandas as pd 
from sklearn.utils import shuffle 
from sklearn.feature_extraction.text import CountVectorizer 
from sklearn.model_selection import train_test_split, cross_val_score 
from sklearn.pipeline import Pipeline 
from sklearn.ensemble import RandomForestClassifier 
from sklearn.tree import DecisionTreeClassifier 
from sklearn.svm import SVC 
from sklearn.metrics import accuracy_score, classification_report 

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

In [14]:
# 2. Podziel dane na zbiór treningowy i testowy
# (Ważne!!! Zbioru testowego nie można zmieniać w trakcie). 
# Zbiór testowy będzie złożony z 20% całości danych (test_size)
X_train, X_test, y_train, y_test = train_test_split(df['text'], df['class'], 
test_size=0.2, random_state=42) 

# 3. Utwórz pipeline klasyfikatorów
classifiers = [ 
    ('Decision Tree', DecisionTreeClassifier()), 
    ('Random Forest', RandomForestClassifier(n_estimators=100, 
random_state=42)), 
    ('SVM', SVC()) 
]

vectorizers = [
    ('Words with at least 3 letters', CountVectorizer(token_pattern=r'\b\w{3,}\b')),
    ('Words with 3-8 letters', CountVectorizer(token_pattern=r'\b\w{3,8}\b')),
    ('Words with 3-10 letters', CountVectorizer(token_pattern=r'\b\w{3,10}\b')),
    ('Words with 3-12 letters', CountVectorizer(token_pattern=r'\b\w{3,12}\b')),
    ('Words with 3-12 letters', CountVectorizer(token_pattern=r'\b\w{3,16}\b')),
    
]

In [15]:
# 4. Utwórz pętlę która przetestuje wszystkie klasyfikatory
results = [] 
 
for classifier_name, classifier in classifiers: 
    for vectorizer_name, vectorizer in vectorizers:
        # Utwórz pipeline z CountVectorizer i klasyfikatorem 
        pipeline = Pipeline([ 
            ('vectorizer', vectorizer), 
            ('classifier', classifier) 
        ]) 

        cv_scores = cross_val_score(pipeline, X_train, y_train, cv=5, scoring='accuracy') 
    
        # Trenuj model 
        pipeline.fit(X_train, y_train) 

        # Przewiduj na danych testowych 
        y_pred = pipeline.predict(X_test) 

        # Oceniaj wyniki 
        accuracy = accuracy_score(y_test, y_pred) 
        report = classification_report(y_test, y_pred) 

        # Dodaj wyniki do listy 
        results.append({ 
            'Classifier': classifier_name, 
            'Vectorizer': vectorizer_name,
            'Mean Accuracy': cv_scores.mean(), 
            'Test Accuracy': accuracy,
            'Cross-Validation Scores': cv_scores, 
            'Classification Report': report 
        })

# 5. Wyświetl wyniki
best_methods = []

for result in results: 
    best_methods.append((result['Test Accuracy'], f"{result['Classifier']}", f"{result['Vectorizer']}"))
    print(f"Classifier: {result['Classifier']}") 
    print(f"Vectorizer: {result['Vectorizer']}")
    print(f"Cross-Validation Scores: {result['Cross-Validation Scores']}") 
    print(f"Mean CV Accuracy: {result['Mean Accuracy']:.4f}") 
    print(f"Accuracy: {result['Test Accuracy']:.4f}") 
    print("Classification Report:") 
    print(result['Classification Report']) 
    print("=" * 50, "\n")



Classifier: Decision Tree
Vectorizer: Words with at least 3 letters
Cross-Validation Scores: [0.76666667 0.9        0.66666667 0.82758621 0.79310345]
Mean CV Accuracy: 0.7908
Accuracy: 0.7027
Classification Report:
              precision    recall  f1-score   support

           0       0.74      0.70      0.72        20
           1       0.67      0.71      0.69        17

    accuracy                           0.70        37
   macro avg       0.70      0.70      0.70        37
weighted avg       0.70      0.70      0.70        37


Classifier: Decision Tree
Vectorizer: Words with 3-8 letters
Cross-Validation Scores: [0.8        0.86666667 0.73333333 0.82758621 0.79310345]
Mean CV Accuracy: 0.8041
Accuracy: 0.7297
Classification Report:
              precision    recall  f1-score   support

           0       0.75      0.75      0.75        20
           1       0.71      0.71      0.71        17

    accuracy                           0.73        37
   macro avg       0.73      0.

In [16]:
# Zastanów się który algorytm działa najlepiej! 
best_methods = sorted(best_methods, key = lambda x : x[0], reverse=True)

print('BEST ALGORITHMS')
print()
for best in best_methods:
    print(f"{best[0]:.4f} - {best[1]} + {best[2]}")

BEST ALGORITHMS

0.7838 - Random Forest + Words with 3-12 letters
0.7838 - SVM + Words with 3-12 letters
0.7568 - Random Forest + Words with 3-8 letters
0.7568 - SVM + Words with at least 3 letters
0.7568 - SVM + Words with 3-10 letters
0.7297 - Decision Tree + Words with 3-8 letters
0.7297 - Decision Tree + Words with 3-12 letters
0.7297 - SVM + Words with 3-8 letters
0.7027 - Decision Tree + Words with at least 3 letters
0.7027 - Decision Tree + Words with 3-10 letters
0.7027 - Decision Tree + Words with 3-12 letters
0.7027 - Random Forest + Words with at least 3 letters
0.7027 - Random Forest + Words with 3-10 letters
0.7027 - SVM + Words with 3-12 letters
0.6757 - Random Forest + Words with 3-12 letters
