In [7]:
import pandas as pd
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt


# Download Imdb datasets
movies = pd.read_csv("./Datasets/IMDb movies.csv")
series = pd.read_csv("./Datasets/IMDb TV series.csv")

# Align Rating Columns
movies['rating'] = movies['avg_vote']  # avg_vote → rating
# series zaten 'rating' içeriyor (doğrudan kullanılabilir)
movies.drop(columns=['avg_vote'], inplace=True, errors='ignore')  # Eski kolon silinir (varsa)

# Specify Source Type
movies['type'] = 'movie'
series['type'] = 'tv_series'

# Merge Based on Common Columns
common_cols = list(set(movies.columns).intersection(set(series.columns)))
imdb = pd.concat([movies[common_cols], series[common_cols]], ignore_index=True)

# General Cleaning Function (removes special characters)
def clean_text(text):
    if pd.notnull(text):
        text = text.lower()
        text = re.sub(r'[^\w\s]', '', text)  # harf ve sayılar dışındaki her şeyi kaldır
        text = re.sub(r'\s+', ' ', text)     # fazla boşlukları tek boşluğa indir
        return text.strip()
    return ""

# Clean Columns
imdb['title'] = imdb['title'].apply(clean_text)
imdb['genre'] = imdb['genre'].apply(clean_text)
imdb['description'] = imdb['description'].apply(clean_text)

# Standardize Type Column (tv_series → tv show)
imdb['type'] = imdb['type'].str.lower().str.strip().replace({'tv_series': 'tv show', 'movie': 'movie'})

# Clean Year Format
imdb['year'] = imdb['year'].astype(str).str.extract(r'(\d{4})')
imdb['year'] = pd.to_numeric(imdb['year'], errors='coerce').fillna(0).astype(int)

# Save as CSV
output_path = "./processed_datasets/cleaned_imdb_combined.csv"
imdb.to_csv(output_path, index=False)
print("✅ IMDb verisi başarıyla kaydedildi:", output_path)

# Show first five row
imdb.head(5)

✅ IMDb verisi başarıyla kaydedildi: ./processed_datasets/cleaned_imdb_combined.csv


Unnamed: 0,title,type,duration,genre,year,votes,description,rating
0,the story of the kelly gang,movie,70,biography crime drama,1906,537,true story of notorious australian outlaw ned ...,6.1
1,den sorte drøm,movie,53,drama,1911,171,two men of high rank are both wooing the beaut...,5.9
2,cleopatra,movie,100,drama history,1912,420,the fabled queen of egypts affair with roman g...,5.2
3,linferno,movie,68,adventure drama fantasy,1911,2019,loosely adapted from dantes divine comedy and ...,7.0
4,from the manger to the cross or jesus of nazareth,movie,60,biography drama,1912,438,an account of the life of jesus christ based o...,5.7


In [8]:
# Load datasets
netflix = pd.read_csv("./Datasets/netflix_titles.csv")
amazon = pd.read_csv("./Datasets/amazon_prime_titles.csv")
disney = pd.read_csv("./Datasets/disney_plus_titles.csv")

# Add Platform Information
netflix['platform'] = 'netflix'
amazon['platform'] = 'amazon'
disney['platform'] = 'disney'

# Normalize Genre Information
netflix['type'] = netflix['type'].str.lower().str.strip()
amazon['type'] = amazon['type'].str.lower().str.strip()
disney['type'] = disney['type'].str.lower().str.strip()

# Merge Based on Common Columns
common_cols = list(set(netflix.columns) & set(amazon.columns) & set(disney.columns))
platforms = pd.concat([
    netflix[common_cols],
    amazon[common_cols],
    disney[common_cols]
], ignore_index=True)

# Cleaning
platforms['title'] = platforms['title'].str.lower().str.strip()
platforms['title'] = platforms['title'].apply(lambda x: re.sub(r'[^\w\s]', '', x) if pd.notnull(x) else x)
platforms['cast'] = platforms['cast'].str.lower().str.strip()
platforms['listed_in'] = platforms['listed_in'].str.lower().str.strip()
platforms['description'] = platforms['description'].str.lower().str.strip()

# Release_year → year
platforms.rename(columns={'release_year': 'year', 'listed_in': 'genre',  }, inplace=True)

# Drop Unnecessary Columns
platforms.drop(columns=['rating', 'cast', 'country', 'duration', 'show_id', 'director', 'date_added'],
               inplace=True, errors='ignore')

# Save file
output_path = "./processed_datasets/Netflix_Disney_Amazon.csv"
platforms.to_csv(output_path, index=False)
print("✅ Platform verisi başarıyla kaydedildi:", output_path)

# Show first 5 rows
platforms.head(5)

✅ Platform verisi başarıyla kaydedildi: ./processed_datasets/Netflix_Disney_Amazon.csv


Unnamed: 0,type,title,description,genre,year,platform
0,movie,dick johnson is dead,"as her father nears the end of his life, filmm...",documentaries,2020,netflix
1,tv show,blood water,"after crossing paths at a party, a cape town t...","international tv shows, tv dramas, tv mysteries",2021,netflix
2,tv show,ganglands,to protect his family from a powerful drug lor...,"crime tv shows, international tv shows, tv act...",2021,netflix
3,tv show,jailbirds new orleans,"feuds, flirtations and toilet talk go down amo...","docuseries, reality tv",2021,netflix
4,tv show,kota factory,in a city of coaching centers known to train i...,"international tv shows, romantic tv shows, tv ...",2021,netflix


In [9]:
# Load Imdb dataset(cleaned)
imdb = pd.read_csv("./processed_datasets/cleaned_imdb_combined.csv")

# Load Platform Data (clean version)
platforms = pd.read_csv("./processed_datasets/Netflix_Disney_Amazon.csv")

# Normalize Titles (convert to lowercase + fix spacing)
def normalize_title(text):
    if pd.notnull(text):
        text = text.lower().strip()
        text = re.sub(r'\s+', ' ', text)
        return text
    return ""

imdb['title'] = imdb['title'].apply(normalize_title)
platforms['title'] = platforms['title'].apply(normalize_title)

# Group Platforms with the Same Title
platform_mapping = platforms.groupby('title')['platform'].apply(lambda x: ', '.join(sorted(set(x)))).reset_index()

# Add Platform Information to IMDb
imdb.drop(columns=['platform'], inplace=True, errors='ignore')
imdb = imdb.merge(platform_mapping, on='title', how='left')
imdb['platform'] = imdb['platform'].fillna("np")
# === 5.1 Platform bilgisine göre binary etiket oluştur
imdb['platform_flag'] = imdb['platform'].apply(lambda x: 0 if x == 'np' else 1)

# Remove Missing Description, Rating, and Duration Data
imdb = imdb[imdb['description'].notna()]
imdb = imdb[imdb['description'].str.strip() != ""]
imdb = imdb[imdb['rating'].notna()]
imdb = imdb[imdb['duration'].notna()]

# Save the file
output_path = "./processed_datasets/imdb_with_platforms.csv"
imdb.to_csv(output_path, index=False)
print("✅ IMDb + Platform verisi başarıyla birleştirildi ve NaN satırlar atıldı:", output_path)

# Show first five rows
print(imdb[['title', 'platform']].head(5))

  imdb = pd.read_csv("./processed_datasets/cleaned_imdb_combined.csv")


✅ IMDb + Platform verisi başarıyla birleştirildi ve NaN satırlar atıldı: ./processed_datasets/imdb_with_platforms.csv
                                               title platform
0                        the story of the kelly gang       np
1                                     den sorte drøm       np
2                                          cleopatra       np
3                                           linferno       np
4  from the manger to the cross or jesus of nazareth       np


In [10]:
# Change votes column to integer
imdb['votes'] = imdb['votes'].astype(str).str.replace(",", "").str.strip()
imdb['votes'] = pd.to_numeric(imdb['votes'], errors='coerce')

# Change duration column to integer 
imdb['duration'] = imdb['duration'].astype(str).str.extract(r'(\d+)')
imdb['duration'] = pd.to_numeric(imdb['duration'], errors='coerce')
imdb.to_csv(output_path, index=False)
imdb[['votes', 'duration']].dtypes

votes       int64
duration    int64
dtype: object

In [11]:
# Remove unnecessary columns
imdb.drop(columns=['type_binary', 'type_tv show'], inplace=True, errors='ignore')

# If the original 'type' column has been re-added or exists, convert it
# Movie = 0, TV Show = 1
imdb['type'] = imdb['type'].str.lower().str.strip()
imdb['type'] = imdb['type'].apply(lambda x: 0 if x == 'movie' else 1)

# Split the 'genre' column based on spaces

genre_split = imdb['genre'].dropna().str.lower().str.split()

# Collect unique genres
unique_genres = set()
for genres in genre_split:
    unique_genres.update(genres)

# create column for every genre
for genre in unique_genres:
    imdb[f"genre_{genre}"] = imdb['genre'].astype(str).str.lower().apply(lambda x: int(genre in x.split()))
    
imdb.to_csv(output_path, index=False)
print (imdb)

                                                   title  type  duration  \
0                            the story of the kelly gang     0        70   
1                                         den sorte drøm     0        53   
2                                              cleopatra     0       100   
3                                               linferno     0        68   
4      from the manger to the cross or jesus of nazareth     0        60   
...                                                  ...   ...       ...   
91225                                     the imperfects     1        45   
91226                                   the walking dead     1        44   
91227                                          the crown     1        58   
91228                                       supernatural     1        44   
91229                                      devil in ohio     1       356   

                         genre  year   votes  \
0        biography crime drama  1906   

In [12]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer

# Load dataset
imdb = pd.read_csv("./processed_datasets/imdb_with_platforms.csv")  # Yolun göreceli olduğuna emin ol

# Fill the empty cells with ("")
descriptions = imdb['description'].fillna("")

# Create a TF-IDF vectorizer (filter English stopwords)
tfidf = TfidfVectorizer(stop_words='english', max_features=500)

# Create TF-IDF matrix
tfidf_matrix = tfidf.fit_transform(descriptions)

# Create dataframe from matrix
tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), columns=tfidf.get_feature_names_out())

# First five result
print(tfidf_df.head())

   abandoned  accident  accidentally  accused  action  actor  actress  \
0        0.0       0.0           0.0      0.0     0.0    0.0      0.0   
1        0.0       0.0           0.0      0.0     0.0    0.0      0.0   
2        0.0       0.0           0.0      0.0     0.0    0.0      0.0   
3        0.0       0.0           0.0      0.0     0.0    0.0      0.0   
4        0.0       0.0           0.0      0.0     0.0    0.0      0.0   

   adventure  adventures  affair  ...  working  works  world  writer  wrong  \
0        0.0         0.0     0.0  ...      0.0    0.0    0.0     0.0    0.0   
1        0.0         0.0     0.0  ...      0.0    0.0    0.0     0.0    0.0   
2        0.0         0.0     1.0  ...      0.0    0.0    0.0     0.0    0.0   
3        0.0         0.0     0.0  ...      0.0    0.0    0.0     0.0    0.0   
4        0.0         0.0     0.0  ...      0.0    0.0    0.0     0.0    0.0   

   year  years  york  young  younger  
0   0.0    0.0   0.0    0.0      0.0  
1   0.0 