In [6]:
import pandas as pd

# Read File Data
with open('positive.txt', 'r', encoding='utf-8') as file:
    positive_data = file.readlines()

with open('negative.txt', 'r', encoding='utf-8') as file:
    negative_data = file.readlines()

# Create dataframe with class (0 - positive, 1 - negative)
df_positive = pd.DataFrame({'text': positive_data, 'class': 0})
df_negative = pd.DataFrame({'text': negative_data, 'class': 1})

# Connect positive and negative dataframes
df = pd.concat([df_positive, df_negative], ignore_index=True)

# Shuffle dataframe
df = df.sample(frac=1).reset_index(drop=True)
print(df.head())

                                                text  class
0  Fantastyczne efekty specjalne. Film robi ogrom...      0
1  Przemyślana symbolika. Każdy element ma swoje ...      0
2  Długo zapadający w pamięć. Do tego świetna reż...      0
3      Film, który w ogóle mnie nie zainteresował...      1
4  Świetne tempo narracji. Akcja rozwija się płyn...      0


In [7]:
# 1. Zaimportuj wszystkie potrzebne biblioteki

import pandas as pd 
from sklearn.utils import shuffle 
from sklearn.feature_extraction.text import CountVectorizer 
from sklearn.model_selection import train_test_split, cross_val_score 
from sklearn.pipeline import Pipeline 
from sklearn.ensemble import RandomForestClassifier 
from sklearn.tree import DecisionTreeClassifier 
from sklearn.svm import SVC 
from sklearn.metrics import accuracy_score, classification_report 

In [8]:
# 2. Podziel dane na zbiór treningowy i testowy
# (Ważne!!! Zbioru testowego nie można zmieniać w trakcie). 
# Zbiór testowy będzie złożony z 20% całości danych (test_size)
X_train, X_test, y_train, y_test = train_test_split(df['text'], df['class'], 
test_size=0.2, random_state=42) 

# 3. Utwórz pipeline klasyfikatorów
classifiers = [ 
    ('Decision Tree', DecisionTreeClassifier()), 
    ('Random Forest', RandomForestClassifier(n_estimators=100, 
random_state=42)), 
    ('SVM', SVC()) 
]

In [9]:
# 4. Utwórz pętlę która przetestuje wszystkie klasyfikatory
results = [] 
 
for classifier_name, classifier in classifiers: 
    # Utwórz pipeline z CountVectorizer i klasyfikatorem 
    pipeline = Pipeline([ 
        ('vectorizer', CountVectorizer()), 
        ('classifier', classifier) 
    ]) 

    cv_scores = cross_val_score(pipeline, X_train, y_train, cv=5, scoring='accuracy') 
 
    # Trenuj model 
    pipeline.fit(X_train, y_train) 

    # Przewiduj na danych testowych 
    y_pred = pipeline.predict(X_test) 

    # Oceniaj wyniki 
    accuracy = accuracy_score(y_test, y_pred) 
    report = classification_report(y_test, y_pred) 

    # Dodaj wyniki do listy 
    results.append({ 
        'Classifier': classifier_name, 
        'Mean Accuracy': cv_scores.mean(), 
        'Cross-Validation Scores': cv_scores, 
        'Classification Report': report 
    })

# 5. Wyświetl wyniki
for result in results: 
    print(f"Classifier: {result['Classifier']}") 
    print(f"Cross-Validation Scores: {result['Cross-Validation Scores']}") 
    print(f"Mean CV Accuracy: {result['Mean Accuracy']:.4f}") 
    print("Classification Report:") 
    print(result['Classification Report']) 
    print("=" * 50)

  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is

Classifier: Decision Tree
Cross-Validation Scores: [0.7        0.86666667 0.66666667 0.65517241 0.68965517]
Mean CV Accuracy: 0.7156
Classification Report:
              precision    recall  f1-score   support

           0       0.80      0.95      0.87        21
           1       0.92      0.69      0.79        16

    accuracy                           0.84        37
   macro avg       0.86      0.82      0.83        37
weighted avg       0.85      0.84      0.83        37

Classifier: Random Forest
Cross-Validation Scores: [0.73333333 0.73333333 0.73333333 0.72413793 0.79310345]
Mean CV Accuracy: 0.7434
Classification Report:
              precision    recall  f1-score   support

           0       1.00      0.48      0.65        21
           1       0.59      1.00      0.74        16

    accuracy                           0.70        37
   macro avg       0.80      0.74      0.69        37
weighted avg       0.82      0.70      0.69        37

Classifier: SVM
Cross-Validation S

  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is