In [1]:
from sklearn.cluster import AgglomerativeClustering # type: ignore
from sklearn.mixture import GaussianMixture # type: ignore
from sklearn.cluster import DBSCAN # type: ignore
from sklearn.metrics import silhouette_score # type: ignore
from sklearn.feature_extraction.text import TfidfVectorizer  # type: ignore
import numpy as np # type: ignore
import pandas as pd # type: ignore
import matplotlib.pyplot as plt # type: ignore


In [2]:
# Load clean_data.csv and keep only the rows that have all three text fields.
TEXT_COLUMNS = ['keywords', 'genres', 'overview']
raw_data = pd.read_csv("clean_data.csv")
# .copy() makes `data` an independent frame, so adding a column below does not
# raise pandas' SettingWithCopyWarning on the filtered result.
data = raw_data.dropna(subset=TEXT_COLUMNS).copy()

# One space-separated text field per row: keywords, then genres, then overview.
data['combined_features'] = data['keywords'] + ' ' + data['genres'] + ' ' + data['overview']
data['combined_features']

0       culture clash future space war space colony so...
1       ocean drug abuse exotic island east india trad...
2       spy based novel secret agent sequel mi action ...
3       dc comics crime fighter terrorist secret ident...
4       based novel mars medallion space travel prince...
                              ...                        
4798    united states umexico barrier legs arms paper ...
4799    unknown comedy romance newlywed couple honeymo...
4800    date love first sight narration investigation ...
4801    unknown drama ambitious new york attorney sam ...
4802    obsession camcorder crush dream girl documenta...
Name: combined_features, Length: 4801, dtype: object

In [4]:
# Convert the combined text into a numeric TF-IDF matrix, ignoring English stop words.
combined_text = data['combined_features']
tfidf = TfidfVectorizer(stop_words="english")
tfidf_matrix = tfidf.fit_transform(combined_text)


In [4]:
# Hierarchical (agglomerative) clustering into 10 clusters.
hier_clustering = AgglomerativeClustering(n_clusters=10)
# The estimator is fitted on a dense copy of the TF-IDF matrix; the labels are
# then read back from the fitted model.
hier_labels = hier_clustering.fit(tfidf_matrix.toarray()).labels_
silhouette_hier = silhouette_score(X=tfidf_matrix, labels=hier_labels)
print(f"Hierarchical Clustering Silhoutte Score: {silhouette_hier}")

In [13]:
# DBSCAN
# DBSCAN accepts the sparse TF-IDF matrix directly, so no dense copy is needed.
dbscan = DBSCAN(eps=0.5, min_samples=5)
dbscan_labels = dbscan.fit_predict(tfidf_matrix)

# DBSCAN marks noise points with the label -1. Noise is a normal outcome, not a
# failure, so the silhouette score is computed on the clustered points only
# instead of being discarded as soon as a single noise point exists.
clustered_idx = np.flatnonzero(dbscan_labels != -1)
n_dbscan_clusters = len(set(dbscan_labels[clustered_idx]))

# silhouette_score requires 2 <= number of clusters <= number of samples - 1.
if 2 <= n_dbscan_clusters < len(clustered_idx):
    silhoutte_dbscan = silhouette_score(tfidf_matrix[clustered_idx], dbscan_labels[clustered_idx])
else:
    silhoutte_dbscan = -1  # no usable clustering: record the worst possible score
print(f"DBSCAN Silhoutte Score: {silhoutte_dbscan}")

DBSCAM Silhoutte Score: -1


In [17]:
# Gaussian Mixture Model (GMM)

from sklearn.decomposition import PCA # type: ignore

# Reduce the dense TF-IDF matrix to 100 principal components before fitting the
# mixture. random_state is fixed so that, if scikit-learn selects a randomized
# SVD solver for this input, the projection is the same on every run.
pca = PCA(n_components=100, random_state=42)
tfidf_matrix_reduced = pca.fit_transform(tfidf_matrix.toarray())

gmm = GaussianMixture(n_components=10, random_state=42)
gmm_labels = gmm.fit_predict(tfidf_matrix_reduced)

# NOTE(review): this score is measured in the PCA-reduced space, while the
# hierarchical and DBSCAN cells score against the full TF-IDF matrix, so the
# three numbers are not measured in the same feature space.
silhouette_gmm = silhouette_score(tfidf_matrix_reduced, gmm_labels)
print(f"GM Silhoutte Score: {silhouette_gmm}")

GM Silhoutte Score: -0.010837713742997463


In [18]:
# Results: one silhouette score per method, in the same order as the display names.
silhouette_scores = [silhouette_hier, silhoutte_dbscan, silhouette_gmm]
clustering_methods = ["Hierarchical", "DBSCAN", "GMM"]

In [19]:
# Select the method with the largest silhouette score.
best_method_index = np.asarray(silhouette_scores).argmax()
best_score = silhouette_scores[best_method_index]
best_method = clustering_methods[best_method_index]


In [20]:

# Report the winning method and its score, one per line.
print(
    f"En iyi kümeleme yöntemi: {best_method}",
    f"En yüksek Silhouette Skoru: {best_score}",
    sep="\n",
)

En iyi kümeleme yöntemi: Hierarchical
En yüksek Silhouette Skoru: -0.0030384601312127874


In [15]:
from mlxtend.frequent_patterns import fpgrowth, association_rules,apriori # type: ignore
from sklearn.feature_extraction.text import CountVectorizer # type: ignore



## FPGrowth


In [6]:
# One-hot encode the tokens of each row for frequent-pattern mining.
# str.split (no argument) splits on any run of whitespace, so repeated spaces do
# not create empty-string tokens. token_pattern=None states explicitly that the
# default pattern is unused because a custom tokenizer is supplied.
vectorizer = CountVectorizer(tokenizer=str.split, token_pattern=None, binary=True)
X = vectorizer.fit_transform(data['combined_features'])
features = vectorizer.get_feature_names_out()

# mlxtend's frequent-pattern functions expect a boolean one-hot DataFrame.
# Casting before densifying also keeps the dense array at one byte per cell.
df_features = pd.DataFrame(X.astype(bool).toarray(), columns=features)





In [8]:
# Mine itemsets present in at least 2% of the rows with FP-Growth, then derive
# association rules whose lift is at least 1.
frequent_itemsets = fpgrowth(
    df_features,
    use_colnames=True,
    min_support=0.02,
)
rules = association_rules(
    frequent_itemsets,
    min_threshold=1,
    metric="lift",
)
print(f"Sık kullanılan Ögeler: {frequent_itemsets}")




Sık kullanılan Ögeler:       support                     itemsets
0    0.249948                     (action)
1    0.173089                  (adventure)
2    0.113726                    (science)
3    0.112477                    (fiction)
4    0.090606                    (fantasy)
..        ...                          ...
535  0.047282          (independent, film)
536  0.035618         (independent, drama)
537  0.024995        (independent, comedy)
538  0.034160   (independent, drama, film)
539  0.024162  (independent, comedy, film)

[540 rows x 2 columns]


In [9]:
# Show only the rule columns of interest.
rule_columns = ['antecedents', 'consequents', 'support', 'confidence', 'lift']
print(f"\n  Birliktelik Kuralları: {rules[rule_columns]}")


  Birliktelik Kuralları:              antecedents            consequents   support  confidence  \
0             (thriller)               (action)  0.112268    0.424076   
1               (action)             (thriller)  0.112268    0.449167   
2      (thriller, drama)               (action)  0.034368    0.295699   
3        (drama, action)             (thriller)  0.034368    0.460894   
4             (thriller)        (drama, action)  0.034368    0.129819   
..                   ...                    ...       ...         ...   
649  (independent, film)               (comedy)  0.024162    0.511013   
650       (comedy, film)          (independent)  0.024162    0.588832   
651        (independent)         (comedy, film)  0.024162    0.489451   
652             (comedy)    (independent, film)  0.024162    0.067052   
653               (film)  (independent, comedy)  0.024162    0.240664   

          lift  
0     1.696656  
1     1.696656  
2     1.183042  
3     1.740953  
4     1.7409

## Apriori

In [10]:
def _as_tokens(text):
    """Split a string on whitespace; return anything else (already tokenized) unchanged."""
    return text.split() if isinstance(text, str) else text


# Replace each row's text with its list of tokens. The isinstance guard makes
# this cell safe to re-run: rows that are already lists are left as they are
# instead of failing on a missing .split().
# NOTE: after this cell the column holds lists, so cells that expect raw strings
# in data['combined_features'] must be run before it.
data['combined_features'] = data['combined_features'].apply(_as_tokens)

# The rows are already tokenized, so the tokenizer is the identity function.
# token_pattern=None states explicitly that the default pattern is unused.
vectorizer = CountVectorizer(
    tokenizer=lambda doc: doc,
    token_pattern=None,
    lowercase=False,
    binary=True,
)
X = vectorizer.fit_transform(data['combined_features'])

# mlxtend's frequent-pattern functions expect a boolean one-hot DataFrame.
df_features = pd.DataFrame(X.astype(bool).toarray(), columns=vectorizer.get_feature_names_out())




In [12]:
# Preview the first five rows of the combined-features column.
data['combined_features'].head(5)

0    [culture, clash, future, space, war, space, co...
1    [ocean, drug, abuse, exotic, island, east, ind...
2    [spy, based, novel, secret, agent, sequel, mi,...
3    [dc, comics, crime, fighter, terrorist, secret...
4    [based, novel, mars, medallion, space, travel,...
Name: combined_features, dtype: object

In [17]:
# Mine itemsets present in at least 2% of the rows with Apriori, then derive
# association rules whose lift is at least 1.
frequent_itemsets = apriori(
    df_features,
    use_colnames=True,
    min_support=0.02,
)
rules = association_rules(
    frequent_itemsets,
    min_threshold=1,
    metric="lift",
)
print(f"Sık  ullanılan Ögeler: {frequent_itemsets}")





Sık  ullanılan Ögeler:       support                               itemsets
0    0.249948                               (action)
1    0.173089                            (adventure)
2    0.022704                                  (age)
3    0.034368                                (agent)
4    0.027286                                (along)
..        ...                                    ...
535  0.020204                   (true, drama, story)
536  0.042908           (science, fiction, thriller)
537  0.027494              (science, fiction, world)
538  0.030619  (science, fiction, adventure, action)
539  0.025828   (science, fiction, action, thriller)

[540 rows x 2 columns]


In [19]:
# Show only the rule columns of interest.
rule_columns = ['antecedents', 'consequents', 'support', 'confidence', 'lift']
print(f"\n Association Rules: {rules[rule_columns]}")


 Association Rules:             antecedents                   consequents   support  confidence  \
0           (adventure)                      (action)  0.099354    0.574007   
1              (action)                   (adventure)  0.099354    0.397500   
2               (agent)                      (action)  0.022495    0.654545   
3              (action)                       (agent)  0.022495    0.090000   
4              (action)                       (based)  0.024162    0.096667   
..                  ...                           ...       ...         ...   
649  (thriller, action)            (science, fiction)  0.025828    0.230056   
650           (science)   (thriller, fiction, action)  0.025828    0.227106   
651           (fiction)   (science, action, thriller)  0.025828    0.229630   
652            (action)  (science, fiction, thriller)  0.025828    0.103333   
653          (thriller)    (science, fiction, action)  0.025828    0.097561   

         lift  
0    2.296507 