# Arkadiusz Pytlik

In [1]:
import os

import pandas as pd
import sqlalchemy
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sqlalchemy import create_engine


# Connection URL of the PostgreSQL database that holds the movie data.
# It can be overridden with the CLASSIFIER_DB_URL environment variable so that
# credentials do not have to be stored in the notebook.
# NOTE(review): the fallback below still embeds a user name and password in
# plain text; remove it (and rotate the password) before sharing the notebook.
db_string = os.environ.get(
    "CLASSIFIER_DB_URL",
    "postgresql://wbauer_adb:adb2020@pgsql-196447.vipserv.org:5432/wbauer_classifier_abd",
)
# Engine reused by every query in the notebook.
db = create_engine(db_string)

In [2]:
# Load the data from the database: one row per (movie, genre) pair with the
# movie title, its plot and the genre name. The LEFT JOINs keep movies that
# have no genre assigned (their genre is missing in the result).
# setseed(0.5) seeds PostgreSQL's random(), so the `random()>0.9` row sample
# is the same on every run.
query_parts = (
    'SELECT setseed(0.5); ',
    'SELECT movies.movie_name title, movies.plot, genres.name genre FROM movies ',
    'LEFT JOIN genres_movies ON movies.movie_id = genres_movies.movie_id ',
    'LEFT JOIN genres ON genres_movies.genre_id = genres.genre_id ',
    'WHERE random()>0.9',
)
select_string = ''.join(query_parts)

df = pd.read_sql(sql=select_string, con=db)
df

Unnamed: 0,title,plot,genre
0,Flåklypa Grand Prix,"In the town of Flåklypa , the inventor Reodo...",Stop motion
1,Star Trek II: The Wrath of Khan,The film opens with Lieutenant Saavik in comm...,Thriller
2,The Big Lebowski,"Jeff ""The Dude"" Lebowski returns home only to ...",Crime Fiction
3,Taxi Driver,"Travis Bickle , an honorably discharged U.S. M...",Thriller
4,Night of the Living Dead,The story begins with the siblings Barbra and...,Horror
...,...,...,...
4290,Love and Curses,"An old hero, Harold, is looking through a phot...",Short Film
4291,"Shake, Rattle and Roll: An American Love Story","In the 1950s, Lyne Danner, whose father was in...",Drama
4292,The Life of Chikuzan,The real Chikuzan appears on a stage in a smal...,
4293,Shariyo Thetto,{{expand-section}} The story revolves around A...,Drama


In [3]:
# Number of sampled rows per genre, most frequent genre first
# (rows with a missing genre are not counted by groupby).
genres = (
    df.groupby('genre')
    .count()
    .sort_values(by='title', ascending=False)
)
genres.head(10)

Unnamed: 0_level_0,title,plot
genre,Unnamed: 1_level_1,Unnamed: 2_level_1
Drama,550,550
Thriller,527,527
Crime Fiction,320,320
Short Film,308,308
Romantic comedy,198,198
Romance Film,187,187
Horror,122,122
Comedy,116,116
Action/Adventure,112,112
Science Fiction,95,95


In [4]:
# The classification target is the most frequent genre, i.e. the label of the
# first row of the count table sorted in descending order.
g = genres.iloc[0].name
print(f"Wybrany gatunek: {g}")

Wybrany gatunek: Drama


In [5]:
# Add a boolean target column named after the selected genre:
# True where the row's genre equals it, False otherwise (also for missing genres).
is_selected_genre = df['genre'].eq(g)
df[g] = is_selected_genre
df

Unnamed: 0,title,plot,genre,Drama
0,Flåklypa Grand Prix,"In the town of Flåklypa , the inventor Reodo...",Stop motion,False
1,Star Trek II: The Wrath of Khan,The film opens with Lieutenant Saavik in comm...,Thriller,False
2,The Big Lebowski,"Jeff ""The Dude"" Lebowski returns home only to ...",Crime Fiction,False
3,Taxi Driver,"Travis Bickle , an honorably discharged U.S. M...",Thriller,False
4,Night of the Living Dead,The story begins with the siblings Barbra and...,Horror,False
...,...,...,...,...
4290,Love and Curses,"An old hero, Harold, is looking through a phot...",Short Film,False
4291,"Shake, Rattle and Roll: An American Love Story","In the 1950s, Lyne Danner, whose father was in...",Drama,True
4292,The Life of Chikuzan,The real Chikuzan appears on a stage in a smal...,,False
4293,Shariyo Thetto,{{expand-section}} The story revolves around A...,Drama,True


In [36]:
# Build the bag-of-words model: plots are lower-cased, accents are stripped to
# ASCII, English stop words are dropped and only the 20 most frequent terms of
# the corpus are kept as count features.
# NOTE(review): with only 20 features and far fewer positive than negative
# rows, the classifier below predicts almost exclusively the negative class;
# consider a larger max_features and class_weight='balanced' to improve it.
vectorizer = CountVectorizer(strip_accents='ascii', stop_words='english', lowercase=True, max_features=20)

X = vectorizer.fit_transform(df['plot'])
# Use the target column created for the selected genre instead of a hard-coded
# 'Drama' name, so the cell keeps working if a different genre is selected.
y = df[g]

# Split into training and test sets (fixed random_state for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.33, random_state=42)

# Fit the model
model = LogisticRegression().fit(X_train, y_train)

# Predict the test part
y_pred = model.predict(X_test)

# Confusion matrix. The label order is fixed explicitly so that row/column 0 is
# always the negative class and row/column 1 the positive class, even when one
# of the classes is absent from the test split.
cm = confusion_matrix(y_test, y_pred, labels=[False, True])
print("confusion matrix:")
print(cm)

# False positive rate: negatives predicted as positive / all actual negatives.
fp = cm[0,1]/cm[0].sum() * 100
# False negative rate: positives predicted as negative / all actual positives.
fn = cm[1,0]/cm[1].sum() * 100
print("\nSzansa na zwrócenie wyniku fałszywie pozytywnego: {a:.2f}%".format(a = fp))
print("Szansa na zwrócenie wyniku fałszywie negatywnego: {a:.2f}%\n".format(a = fn))

# Class probabilities for the test rows; columns follow model.classes_.
pp = model.predict_proba(X_test)
print("predict proba:")
print(pp)


confusion matrix:
[[1246    2]
 [ 170    0]]

Szansa na zwrócenie wyniku fałszywie pozytywnego: 0.16%
Szansa na zwrócenie wyniku fałszywie negatywnego: 100.00%

predict proba:
[[0.82623814 0.17376186]
 [0.8302811  0.1697189 ]
 [0.85983045 0.14016955]
 ...
 [0.86109806 0.13890194]
 [0.9295123  0.0704877 ]
 [0.8441937  0.1558063 ]]


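Model dobrze klasyfikuje wartości negatywne, natomiast kompletnie nie radzi sobie z przyporządkowaniem wartości pozytywnych: w zbiorze testowym żaden ze 170 filmów z wybranego gatunku nie został poprawnie rozpoznany (100% wyników fałszywie negatywnych).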