In [1]:
#Piotr Skorulski (24494)
#Christian Pospiech (25659)
#Mateusz Szymański (25702)

import re
from functools import lru_cache

import pandas as pd
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC

# Text preprocessing
@lru_cache(maxsize=1)
def _text_resources():
    """Build the English stop-word set and the lemmatizer once and reuse them.

    preprocess_text is applied row by row to a whole column, so loading the
    stop-word list and constructing a WordNetLemmatizer on every call is
    repeated, loop-invariant work. The cache makes it happen only once.
    """
    return frozenset(stopwords.words("english")), WordNetLemmatizer()


def preprocess_text(text):
    """Normalize a raw description into a space-joined string of lemmas.

    Steps: replace every character outside a-z/A-Z with a space, lowercase the
    result, drop English stop words, and lemmatize each remaining token.

    Parameters
    ----------
    text : str
        Raw input text.

    Returns
    -------
    str
        The cleaned tokens joined by single spaces (empty if nothing is left).
    """
    stop_words, lemmatizer = _text_resources()
    # Remove special characters and digits, normalize to lowercase, tokenize
    words = re.sub(r"[^a-zA-Z]", " ", text).lower().split()
    # Drop stop words, then reduce each word to its base form
    return " ".join(
        lemmatizer.lemmatize(word) for word in words if word not in stop_words
    )

# Load the training data; later cells read its 'tekst' (description) and
# 'gatunek' (genre label) columns.
data = pd.read_csv('dane_treningowe.csv')


In [2]:
# Show the first rows followed by the summary statistics of the loaded frame
print(data.head(), data.describe(), sep="\n")

                                               tekst  \
0  The film opens with two bandits breaking into ...   
1  The film is about a family who move to the sub...   
2  The Rarebit Fiend gorges on Welsh rarebit at a...   
3  The film features a train traveling through th...   
4  Irish villager Kathleen is a tenant of Captain...   

                      gatunek  
0                     western  
1                      comedy  
2                       short  
3  short action/crime western  
4                  short film  
                                                    tekst gatunek
count                                               28768   28768
unique                                              28320    2241
top     The films take place three years after the eve...   drama
freq                                                    5    5991


In [3]:
# Genre analysis: collect the distinct values of the 'gatunek' column
# and print how many there are.
gatunek_column = data['gatunek']
unique_genres = gatunek_column.unique()
genre_list = unique_genres.tolist()
print(len(genre_list))

2241


In [4]:
# Count the rows per genre label and list the labels from most to least frequent
genre_counts = data['gatunek'].value_counts()
sorted_genres = genre_counts.sort_values(ascending=False)
print(sorted_genres)

gatunek
drama                     5991
comedy                    4398
horror                    1172
action                    1119
thriller                   984
                          ... 
epic film                    1
comedy, crime, romance       1
sports/social                1
found footage                1
horror romantic comedy       1
Name: count, Length: 2241, dtype: int64


In [5]:
# Drop rare genres: every label that occurs fewer than `threshold` times is removed
threshold = 10  # minimum number of rows a genre needs in order to be kept
genres_to_delete = sorted_genres[sorted_genres < threshold].index.tolist()
# .copy() yields an independent frame, so the later assignment to the 'tekst'
# column does not write into a slice of the original DataFrame
# (avoids pandas' SettingWithCopyWarning).
data = data[~data['gatunek'].isin(genres_to_delete)].copy()

In [6]:
# Number of distinct genres left after the rare ones were removed
genres_column = data['gatunek']
print(len(genres_column.unique()))

165


In [7]:
# Clean every description with preprocess_text (overwrites the 'tekst' column),
# then separate the features (X) from the labels (y)
data['tekst'] = data['tekst'].apply(preprocess_text)
X = data['tekst']
y = data['gatunek']

In [8]:
# Split the data into a training set and a test set (20% held out, fixed seed)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Text vectorization: the TF-IDF vocabulary is fitted on the training texts only
# and then applied unchanged to the test texts
vectorizer = TfidfVectorizer()
X_train_vectors = vectorizer.fit_transform(X_train)
X_test_vectors = vectorizer.transform(X_test)

In [9]:
# Train the classifier. The seed is fixed (like the train/test split above)
# so that repeated runs produce the same model.
classifier1 = LinearSVC(random_state=42)
classifier1.fit(X_train_vectors, y_train)

In [10]:
# Predict genres for the held-out test data
predictions = classifier1.predict(X_test_vectors)

In [11]:
# Compute accuracy, precision, recall and F1 score (weighted by class support).
# zero_division=0 explicitly scores a label as 0 when its metric is undefined
# (e.g. precision for a genre that was never predicted) instead of letting
# scikit-learn emit an UndefinedMetricWarning; the resulting values are the same.
accuracy = accuracy_score(y_test, predictions)
precision = precision_score(y_test, predictions, average='weighted', zero_division=0)
recall = recall_score(y_test, predictions, average='weighted', zero_division=0)
f1 = f1_score(y_test, predictions, average='weighted', zero_division=0)

print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)

Accuracy: 0.4158102766798419
Precision: 0.34558075918251535
Recall: 0.4158102766798419
F1 Score: 0.3652734990105121


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [14]:
# Train the classifier - RandomForestClassifier
from sklearn.ensemble import RandomForestClassifier
# A random forest is stochastic; fixing the seed makes the reported scores
# reproducible from one run to the next.
classifier2 = RandomForestClassifier(random_state=42)
classifier2.fit(X_train_vectors, y_train)

# Predict genres for the held-out test data
predictions = classifier2.predict(X_test_vectors)

In [15]:
# Compute accuracy, precision, recall and F1 score (weighted by class support).
# zero_division=0 explicitly scores a label as 0 when its metric is undefined
# (e.g. precision for a genre that was never predicted) instead of letting
# scikit-learn emit an UndefinedMetricWarning; the resulting values are the same.
accuracy = accuracy_score(y_test, predictions)
precision = precision_score(y_test, predictions, average='weighted', zero_division=0)
recall = recall_score(y_test, predictions, average='weighted', zero_division=0)
f1 = f1_score(y_test, predictions, average='weighted', zero_division=0)

print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)

Accuracy: 0.3383399209486166
Precision: 0.33067276469478324
Recall: 0.3383399209486166
F1 Score: 0.24376947487678832


  _warn_prf(average, modifier, msg_start, len(result))


In [17]:
# Read a film description from the user and clean it the same way as the training texts
opis = input("Wprowadź opis filmu: ")
opis = preprocess_text(opis)

# Vectorize the user's description with the already-fitted TF-IDF vectorizer
opis_vector = vectorizer.transform([opis])

# Predict the film genre with the first LinearSVC model
prediction = classifier1.predict(opis_vector)

print("Przewidywany gatunek filmu: ", prediction)

Wprowadź opis filmu: The end of the road begins. Fast X, the tenth film in the Fast and Furious Saga, launches the final chapters of one of cinema's most storied and popular global franchises, now in its third decade and still going strong with the same core cast and characters as when it began. Over many missions and against impossible odds, Dom Toretto (Vin Diesel) and his family have outsmarted, out-nerved and outdriven every foe in their path. Now, they confront the most lethal opponent they've ever faced: A terrifying threat emerging from the shadows of the past who's fueled by blood revenge, and who is determined to shatter this family and destroy everything-and everyone-that Dom loves, forever. In 2011's Fast Five, Dom and his crew took out nefarious Brazilian drug kingpin Hernan Reyes and decapitated his empire on a bridge in Rio De Janeiro. What they didn't know was that Reyes' son, Dante (Aquaman's Jason Momoa), witnessed it all and has spent the last 12 years masterminding a

In [18]:
# Retrain LinearSVC with a different value of the regularization parameter C.
# Seeds are fixed on both models so the comparison below is reproducible.
classifier_svc = LinearSVC(C=0.1, random_state=42)
classifier_svc.fit(X_train_vectors, y_train)

# Retrain RandomForestClassifier with n_estimators set explicitly.
# NOTE(review): 100 appears to be scikit-learn's default for n_estimators, so this
# model may not differ from classifier2 - confirm, and pick another value if a
# real parameter change is intended.
classifier_rf = RandomForestClassifier(n_estimators=100, random_state=42)
classifier_rf.fit(X_train_vectors, y_train)

In [19]:
# zero_division=0 (used in every precision/recall/F1 call below) explicitly scores
# a label as 0 when its metric is undefined (e.g. a genre that was never predicted)
# instead of letting scikit-learn emit an UndefinedMetricWarning; values are unchanged.

# Predict on the test data with the LinearSVC model
predictions_svc = classifier_svc.predict(X_test_vectors)

# Compute the metrics for the LinearSVC model
accuracy_svc = accuracy_score(y_test, predictions_svc)
precision_svc = precision_score(y_test, predictions_svc, average='weighted', zero_division=0)
recall_svc = recall_score(y_test, predictions_svc, average='weighted', zero_division=0)
f1_svc = f1_score(y_test, predictions_svc, average='weighted', zero_division=0)

# Print the results for the LinearSVC model
print("LinearSVC Classifier:")
print("Accuracy: ", accuracy_svc)
print("Precision: ", precision_svc)
print("Recall: ", recall_svc)
print("F1 Score: ", f1_svc)

# Predict on the test data with the RandomForestClassifier model
predictions_rf = classifier_rf.predict(X_test_vectors)

# Compute the metrics for the RandomForestClassifier model
accuracy_rf = accuracy_score(y_test, predictions_rf)
precision_rf = precision_score(y_test, predictions_rf, average='weighted', zero_division=0)
recall_rf = recall_score(y_test, predictions_rf, average='weighted', zero_division=0)
f1_rf = f1_score(y_test, predictions_rf, average='weighted', zero_division=0)

# Print the results for the RandomForestClassifier model
print("RandomForestClassifier:")
print("Accuracy: ", accuracy_rf)
print("Precision: ", precision_rf)
print("Recall: ", recall_rf)
print("F1 Score: ", f1_rf)

  _warn_prf(average, modifier, msg_start, len(result))


LinearSVC Classifier:
Accuracy:  0.42628458498023714
Precision:  0.35863825242892067
Recall:  0.42628458498023714
F1 Score:  0.34330285157624135
RandomForestClassifier:
Accuracy:  0.33893280632411066
Precision:  0.3271482158350674
Recall:  0.33893280632411066
F1 Score:  0.2420819513625008


  _warn_prf(average, modifier, msg_start, len(result))


In [20]:
# Read a film description from the user and clean it the same way as the training texts
opis = input("Wprowadź opis filmu: ")
opis = preprocess_text(opis)

# Vectorize the user's description with the already-fitted TF-IDF vectorizer
opis_vector = vectorizer.transform([opis])

# Predict the film genre with the retrained LinearSVC model (C=0.1)
prediction = classifier_svc.predict(opis_vector)

print("Przewidywany gatunek filmu: ", prediction)

Wprowadź opis filmu: John J. Rambo is a former United States Special Forces soldier who fought in Vietnam and won the Congressional Medal of Honor, but his time in Vietnam still haunts him. As he came to Hope, Washington to visit a friend, he was guided out of town by the Sheriff William Teasel who insults Rambo, but what Teasel does not know that his insult angered Rambo to the point where Rambo became violent and was arrested. As he was at the county jail being cleaned, he escapes and goes on a rampage through the forest to try to escape from the sheriffs who want to kill him. Then, as Rambo's commanding officer, Colonel Samuel Trautman tries to save both the Sheriff's department and Rambo before the situation gets out of hand.
Przewidywany gatunek filmu:  ['action']


In [22]:
# Export the LinearSVC model to a file.
# NOTE(review): only the classifier is pickled; the fitted TfidfVectorizer that
# turns text into its input features is not saved by this cell.
import pickle
with open('modelSVC.pkl', 'wb') as file:
    pickle.dump(classifier_svc, file)

In [23]:
# Export the RandomForestClassifier model to a file.
# It gets its own file name so that the LinearSVC model stored in
# 'modelSVC.pkl' is not overwritten.
with open('modelRFC.pkl', 'wb') as file:
    pickle.dump(classifier_rf, file)