# Preprocessing

In [1]:
import pandas as pd

df1 = pd.read_csv("mpst_full_data.csv") 
df2 = pd.read_csv("wiki_movie_plots_deduped.csv")
df3 = pd.read_csv("movie_plots_dataset_by_sidhant_yadav.csv")
df4 = pd.read_csv("movie.csv")
df_large_genres = pd.read_csv('movies.csv')

In [2]:
def check_dataset_issues(df):
    print("=== Dataset Quality Check ===")
    
    print("\n1. Missing Values (NaN/None):")
    null_counts = df.isnull().sum()
    if null_counts.sum() == 0:
        print("✅ No missing values found.")
    else:
        print("Columns with missing values:")
        print(null_counts[null_counts > 0].to_string())
    
    print("\n2. 'unknown' or 'n/a' values in text columns:")
    text_columns = df.select_dtypes(include='object').columns
    if len(text_columns) == 0:
        print("✅ No text columns to check.")
    else:
        issues_found = False
        for col in text_columns:
            count = df[col].str.contains('unknown|n/a', case=False, na=False).sum()
            if count > 0:
                print(f"⚠️ {col}: {count} instances of 'unknown/n/a'")
                issues_found = True
        if not issues_found:
            print("✅ No 'unknown' or 'n/a' values found.")
    
    if 'genres' in df.columns:
        print("\n3. Empty genre lists:")
        empty_genres = df['genres'].apply(lambda x: len(x) == 0).sum()
        if empty_genres == 0:
            print("✅ No empty genre lists.")
        else:
            print(f"⚠️ {empty_genres} movies with empty genre lists")
    
    print("\n=== Check complete ===")

#### Preprocessing of `mpst_full_data.csv`

In [3]:
df1.shape

(14828, 6)

In [4]:
df1 = df1.drop(columns=["imdb_id", "split", "synopsis_source"])

df1["tags"] = (
    df1["tags"]
    .str.split(", ")
    .apply(lambda x: [tag.lower().strip() for tag in x] if isinstance(x, list) else [])
)

df1 = df1.rename(columns={'title': 'title', 'plot_synopsis': 'plot', 'tags': 'genres'}) 

df1.head()

Unnamed: 0,title,plot,genres
0,I tre volti della paura,Note: this synopsis is for the orginal Italian...,"[cult, horror, gothic, murder, atmospheric]"
1,Dungeons & Dragons: The Book of Vile Darkness,"Two thousand years ago, Nhagruul the Foul, a s...",[violence]
2,The Shop Around the Corner,"Matuschek's, a gift store in Budapest, is the ...",[romantic]
3,Mr. Holland's Opus,"Glenn Holland, not a morning person by anyone'...","[inspiring, romantic, stupid, feel-good]"
4,Scarface,"In May 1980, a Cuban man named Tony Montana (A...","[cruelty, murder, dramatic, cult, violence, at..."


In [5]:
check_dataset_issues(df1)

=== Dataset Quality Check ===

1. Missing Values (NaN/None):
✅ No missing values found.

2. 'unknown' or 'n/a' values in text columns:
⚠️ title: 6 instances of 'unknown/n/a'
⚠️ plot: 1141 instances of 'unknown/n/a'

3. Empty genre lists:
✅ No empty genre lists.

=== Check complete ===


#### Preprocessing of `wiki_movie_plots_deduped.csv`

In [6]:
df2.shape

(34886, 8)

In [7]:
df2 = df2.drop(columns=["Release Year", "Origin/Ethnicity", "Director", "Cast", "Wiki Page"])

df2["Genre"] = (
    df2["Genre"]
    .str.split(", ")
    .apply(lambda x: [tag.lower().strip() for tag in x] if isinstance(x, list) else [])
)

df2 = df2.rename(columns={'Title': 'title', 'Plot': 'plot', 'Genre': 'genres'}) 

df2.head()

Unnamed: 0,title,genres,plot
0,Kansas Saloon Smashers,[unknown],"A bartender is working at a saloon, serving dr..."
1,Love by the Light of the Moon,[unknown],"The moon, painted with a smiling face hangs ov..."
2,The Martyred Presidents,[unknown],"The film, just over a minute long, is composed..."
3,"Terrible Teddy, the Grizzly King",[unknown],Lasting just 61 seconds and consisting of two ...
4,Jack and the Beanstalk,[unknown],The earliest known adaptation of the classic f...


In [8]:
check_dataset_issues(df2)

=== Dataset Quality Check ===

1. Missing Values (NaN/None):
✅ No missing values found.

2. 'unknown' or 'n/a' values in text columns:
⚠️ title: 18 instances of 'unknown/n/a'
⚠️ plot: 1335 instances of 'unknown/n/a'

3. Empty genre lists:
✅ No empty genre lists.

=== Check complete ===


#### Preprocessing of `movie_plots_dataset_by_sidhant_yadav.csv`

In [9]:
df3.shape

(92447, 3)

In [10]:
df3 = df3.drop(columns=["Plot"])

df3 = df3.rename(columns={'id': 'title', 'Name': 'plot'}) 

df3.head()

Unnamed: 0,title,plot
0,Kansas Saloon Smashers,"A bartender is working at a saloon, serving dr..."
1,Love by the Light of the Moon,"The moon, painted with a smiling face hangs ov..."
2,The Martyred Presidents,"The film, just over a minute long, is composed..."
3,"Terrible Teddy, the Grizzly King",Lasting just 61 seconds and consisting of two ...
4,Jack and the Beanstalk,The earliest known adaptation of the classic f...


In [11]:
check_dataset_issues(df3)

=== Dataset Quality Check ===

1. Missing Values (NaN/None):
Columns with missing values:
title    130
plot     184

2. 'unknown' or 'n/a' values in text columns:
⚠️ title: 24 instances of 'unknown/n/a'
⚠️ plot: 3732 instances of 'unknown/n/a'

=== Check complete ===


#### Preprocessing of `movie.csv`

In [12]:
df4.shape

(8980, 8)

In [13]:
df4 = df4[['title', 'overview']]

df4 = df4.rename(columns={'overview': 'plot'}) 

df4.head()

Unnamed: 0,title,plot
0,The Shawshank Redemption,Framed in the 1940s for the double murder of h...
1,The Godfather,"Spanning the years 1945 to 1955, a chronicle o..."
2,The Godfather Part II,In the continuing saga of the Corleone crime f...
3,Schindler's List,The true story of how businessman Oskar Schind...
4,12 Angry Men,The defense and the prosecution have rested an...


#### Preprocessing of `movies.csv`

In [15]:
df_large_genres = df_large_genres[['title', 'genres', 'overview']]

df_large_genres['genres'] = df_large_genres['genres'].str.lower().str.split('-')

df_large_genres = df_large_genres.rename(columns={'overview': 'plot'}) 

df_large_genres.head()

Unnamed: 0,title,genres,plot
0,Meg 2: The Trench,"[action, science fiction, horror]",An exploratory dive into the deepest depths of...
1,The Pope's Exorcist,"[horror, mystery, thriller]",Father Gabriele Amorth Chief Exorcist of the V...
2,Deadpool & Wolverine,"[action, comedy, science fiction]",A listless Wade Wilson toils away in civilian ...
3,Transformers: Rise of the Beasts,"[action, adventure, science fiction]",When a new threat capable of destroying the en...
4,Dune: Part Two,"[science fiction, adventure]",Follow the mythic journey of Paul Atreides as ...


#### Combining datasets

In [16]:
def clean_dataset(df):
    mask = (
        df['title'].isna() |
        df['plot'].isna() |
        df['title'].str.lower().str.contains('unknown|n/a', na=True) |
        df['plot'].str.lower().str.contains('unknown|n/a', na=True)
    )
    return df[~mask]

In [20]:
import numpy as np

def fill_empty_genres(x):
    if isinstance(x, (list, np.ndarray)):
        return x if len(x) > 0 else ['']
    elif pd.isna(x):
        return ['']
    else:
        return x

In [29]:
import pandas as pd

def process_and_combine_datasets(df_large, df1, df2, df3, df4):    
    print("Adding genres to third and forth datasets...")
    genre_mapping = df_large.set_index('title')['genres'].to_dict()
    
    df3['genres'] = df3['title'].map(genre_mapping)
    df3['genres'] = df3['genres'].apply(fill_empty_genres)
    
    df4['genres'] = df4['title'].map(genre_mapping)
    df4['genres'] = df4['genres'].apply(fill_empty_genres)
    
    print("Cleaning datasets...")
    df1 = clean_dataset(df1)
    df2 = clean_dataset(df2)
    df3 = clean_dataset(df3)
    df4 = clean_dataset(df4)
    
    print("Combining datasets...")
    combined = pd.concat([df1, df2, df3, df4], ignore_index=True)
    combined = combined.drop_duplicates(['title'])
    
    print(f"Final combined dataset shape: {combined.shape}")
    return combined

In [30]:
combined_df = process_and_combine_datasets(df_large_genres, df1, df2, df3, df4)

Adding genres to third and forth datasets...
Cleaning datasets...
Combining datasets...
Final combined dataset shape: (41151, 3)


In [31]:
check_dataset_issues(combined_df)

=== Dataset Quality Check ===

1. Missing Values (NaN/None):
✅ No missing values found.

2. 'unknown' or 'n/a' values in text columns:
✅ No 'unknown' or 'n/a' values found.

3. Empty genre lists:
✅ No empty genre lists.

=== Check complete ===


In [32]:
import heapq
import pandas as pd

def get_top_descriptions(main_df, exclude_df, n, batch_size=100000):
    exclude_titles = set(exclude_df['title'])
    
    counter = 0
    top_descriptions = []
    
    for i in range(0, len(main_df), batch_size):
        batch = main_df.iloc[i:i+batch_size]

        batch = clean_dataset(batch)
        
        filtered_batch = batch[~batch['title'].isin(exclude_titles)]
        
        for _, row in filtered_batch.iterrows():
            row_dict = row.to_dict()
            plot = row_dict.get('plot', '')
            
            length = len(str(plot))
            
            if len(top_descriptions) < n:
                heapq.heappush(top_descriptions, (length, counter, row_dict))
            else:
                heapq.heappushpop(top_descriptions, (length, counter, row_dict))
            counter += 1
    
    top_descriptions.sort(reverse=True, key=lambda x: x[0])
    
    result_df = pd.DataFrame([row_dict for (length, counter, row_dict) in top_descriptions])
    
    return result_df[main_df.columns]

In [33]:
print("Combining datasets...")
additional_df = get_top_descriptions(df_large_genres, combined_df, 50000-combined_df.shape[0])

final_df = pd.concat([combined_df, additional_df], ignore_index=True)

final_df['genres'] = final_df['genres'].apply(fill_empty_genres)

print(f"Final combined dataset shape: {final_df.shape}")

check_dataset_issues(final_df)

Combining datasets...
Final combined dataset shape: (50000, 3)
=== Dataset Quality Check ===

1. Missing Values (NaN/None):
✅ No missing values found.

2. 'unknown' or 'n/a' values in text columns:
✅ No 'unknown' or 'n/a' values found.

3. Empty genre lists:
✅ No empty genre lists.

=== Check complete ===


In [34]:
final_df.to_csv('final_dataset.csv', index=False)

In [None]:
final_df = pd.read_csv('final_dataset.csv')

final_df.head()

Unnamed: 0,title,plot,genres
0,Dungeons & Dragons: The Book of Vile Darkness,"Two thousand years ago, Nhagruul the Foul, a s...",['violence']
1,Scarface,"In May 1980, a Cuban man named Tony Montana (A...","['cruelty', 'murder', 'dramatic', 'cult', 'vio..."
2,A Single Man,George Falconer (Colin Firth) approaches a car...,"['romantic', 'queer', 'flashback']"
3,Baise-moi,Baise-moi tells the story of Nadine and Manu w...,"['gothic', 'cruelty', 'violence', 'cult', 'rev..."
4,Flightplan,Kyle Pratt (Jodie Foster) is a propulsion engi...,"['mystery', 'suspenseful', 'action', 'murder',..."
