# Recommendation System Using LLMs
## Custom Dataset Generation

---

## Initialization of IMDB Datasets

In [35]:
from pathlib import Path

import pandas as pd

# IMDB TSV source file -> name of the notebook variable that will hold its DataFrame
file_paths = {
    "DataSets/IMDB Datasets/title.akas.tsv": "akas_backup_df",
    "DataSets/IMDB Datasets/title.basics.tsv": "title_df",
    "DataSets/IMDB Datasets/title.episode.tsv": "episode_df",
    "DataSets/IMDB Datasets/title.ratings.tsv": "ratings_df"
}

# Read every tab-separated file and bind the resulting DataFrame to its configured
# name in the notebook namespace, so later cells can refer to akas_backup_df,
# title_df, episode_df and ratings_df directly.
# NOTE(review): read_csv's default quote handling is in effect here; if any title
# contains a bare double-quote character, rows could be merged while parsing.
# Confirm the row counts, and consider quoting=csv.QUOTE_NONE if they are off.
notebook_namespace = globals()
for path, df_name in file_paths.items():
    loaded_frame = pd.read_csv(path, sep='\t', low_memory=False)
    notebook_namespace[df_name] = loaded_frame
    print(f"Imported DataFrame: {df_name} from {path}")

print("Datasets Imported Successfully...")


Imported DataFrame: akas_backup_df from DataSets/IMDB Datasets/title.akas.tsv
Imported DataFrame: title_df from DataSets/IMDB Datasets/title.basics.tsv
Imported DataFrame: episode_df from DataSets/IMDB Datasets/title.episode.tsv
Imported DataFrame: ratings_df from DataSets/IMDB Datasets/title.ratings.tsv
Datasets Imported Successfully...


---


## Cleaning Datasets

In [36]:
# Values of the 'types' column whose rows are excluded from the working table
types_to_drop = ['alternative', 'festival', 'dvd', 'video', 'alternative\x02tv', 'festival\x02imdbDisplay',
                 'alternative\x02dvd', 'dvd\x02imdbDisplay', 'video\x02working', 'festival\x02working',
                 'dvd\x02video', 'alternative\x02festival', 'alternative\x02video', 'alternative\x02working']

# Derive akas_df from the backup in a single non-mutating chain, leaving
# akas_backup_df untouched:
#   1. discard the columns that are not used later
#   2. discard rows whose 'types' value is listed in types_to_drop
#   3. order by 'titleId' ascending and 'types' descending; the sort key replaces
#      values equal to the literal '\N' with 'zz', so those rows are ordered as 'zz'
#   4. keep only the first row per 'titleId' under that ordering
#   5. discard 'types', which is not needed once the ordering has been applied
akas_df = (
    akas_backup_df
    .drop(columns=['ordering', 'title', 'attributes', 'isOriginalTitle'])
    .loc[lambda frame: ~frame['types'].isin(types_to_drop)]
    .sort_values(by=['titleId', 'types'], ascending=[True, False],
                 key=lambda col: col.replace('\\N', 'zz'))
    .drop_duplicates(subset='titleId', keep='first')
    .drop(columns=['types'])
)


In [37]:
# Collect the unique 'tconst' identifiers listed in episode_df
# (presumably one per TV episode -- confirm against the IMDB schema)
tconst_list = episode_df['tconst'].unique().tolist()

# Remove rows in akas_df whose 'titleId' appears in that list
akas_df = akas_df[~akas_df['titleId'].isin(tconst_list)]

# Identify adult titles from title_df.
# The flag is compared numerically so the filter works whether pandas inferred
# 'isAdult' as strings ('1') or as integers (1); a plain == '1' silently matches
# nothing on an integer column. Values that are not numeric become NaN and are
# therefore treated as non-adult.
is_adult = pd.to_numeric(title_df['isAdult'], errors='coerce') == 1
adult_title_ids = title_df.loc[is_adult, 'tconst'].tolist()

# Exclude adult titles from akas_df
akas_df = akas_df[~akas_df['titleId'].isin(adult_title_ids)]

# Rename 'titleId' to 'tconst' so akas_df shares its key column name with the other tables
akas_df = akas_df.rename(columns={'titleId': 'tconst'})

# Drop the title_df columns that are not needed downstream.
# Reassignment is used instead of inplace=True so the statement reads like the
# akas_df steps above.
# NOTE: this cell consumes 'titleId' and 'isAdult', so it cannot be re-run on its
# own without re-running the cells above first.
title_df = title_df.drop(columns=['runtimeMinutes', 'originalTitle', 'isAdult'])


In [38]:
# 'titleType' values that must not appear in the final DataFrame
titleType_to_drop = ['short', 'tvShort', 'tvMiniSeries', 'tvSpecial', 'video', 'videoGame']

# Build the final DataFrame in one chain:
#   - left-join title_df onto akas_df by 'tconst' (every akas_df row is kept)
#   - join ratings_df by 'tconst' with the default inner join (rows without a
#     rating entry are dropped)
#   - keep rows whose 'titleType' is not in titleType_to_drop and, in the same
#     .loc call, select the output columns in their final order
df = (
    akas_df
    .merge(title_df, on='tconst', how='left')
    .merge(ratings_df, on='tconst')
    .loc[
        lambda merged: ~merged['titleType'].isin(titleType_to_drop),
        ['tconst', 'titleType', 'primaryTitle', 'startYear', 'endYear',
         'genres', 'language', 'region', 'averageRating', 'numVotes'],
    ]
)


---

In [39]:
# Save the final DataFrame to a CSV file (without the index column).
output_path = Path('DataSets/IMDB Datasets/Output/Custom_IMDB_Dataset.csv')

# to_csv does not create missing parent directories, so make sure the output
# folder exists; this keeps a fresh checkout runnable end to end.
output_path.parent.mkdir(parents=True, exist_ok=True)

df.to_csv(output_path, index=False)


---

In [40]:
# Display the (rows, columns) dimensions of the final DataFrame
df.shape


(445067, 10)

In [41]:
# Preview the first 20 rows of the final DataFrame
df.head(20)


Unnamed: 0,tconst,titleType,primaryTitle,startYear,endYear,genres,language,region,averageRating,numVotes
8,tt0000009,movie,Miss Jerry,1894,\N,Romance,\N,DE,5.3,208
144,tt0000147,movie,The Corbett-Fitzsimmons Fight,1897,\N,"Documentary,News,Sport",\N,\N,5.2,505
337,tt0000502,movie,Bohemios,1905,\N,\N,\N,\N,4.1,15
369,tt0000574,movie,The Story of the Kelly Gang,1906,\N,"Action,Adventure,Biography",\N,\N,6.0,874
377,tt0000591,movie,The Prodigal Son,1907,\N,Drama,\N,US,5.5,23
391,tt0000615,movie,Robbery Under Arms,1907,\N,Drama,\N,\N,4.3,25
398,tt0000630,movie,Hamlet,1908,\N,Drama,\N,US,2.9,27
428,tt0000675,movie,Don Quijote,1908,\N,Drama,\N,\N,4.2,20
430,tt0000679,movie,The Fairylogue and Radio-Plays,1908,\N,"Adventure,Fantasy",\N,\N,5.0,70
538,tt0000862,movie,Faldgruben,1909,\N,\N,\N,\N,4.4,17
