## Analyse the relationship between movie genres and their box office performance to recommend which genres the studio should focus on producing.
### Data sources: IMDB (genres) and Box Office Mojo, abbreviated "bom" (gross revenue).

In [1]:
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import sqlite3 as sq3

In [2]:
# Replace the file paths in the cells below with the appropriate locations on your machine.

#### 1. First, we load the two data sources

In [3]:
# Box Office Mojo (bom): read the .csv.gz file of movie gross figures into a DataFrame.
bom_csv_path = r"C:\Users\josep\OneDrive\Documents\Data science\School projects\film-success-analysis\zippedData\bom.movie_gross.csv.gz"
bom = pd.read_csv(bom_csv_path)
bom

Unnamed: 0,title,studio,domestic_gross,foreign_gross,year
0,Toy Story 3,BV,415000000.0,652000000,2010
1,Alice in Wonderland (2010),BV,334200000.0,691300000,2010
2,Harry Potter and the Deathly Hallows Part 1,WB,296000000.0,664300000,2010
3,Inception,WB,292600000.0,535700000,2010
4,Shrek Forever After,P/DW,238700000.0,513900000,2010
...,...,...,...,...,...
3382,The Quake,Magn.,6200.0,,2018
3383,Edward II (2018 re-release),FM,4800.0,,2018
3384,El Pacto,Sony,2500.0,,2018
3385,The Swan,Synergetic,2400.0,,2018


In [4]:
# IMDB: open a connection to the SQLite database file.
imdb_db_path = r"C:\Users\josep\Downloads\Compressed\im.db\im.db"
conn = sq3.connect(imdb_db_path)

In [5]:
# Load every column of the movie_basics table (it includes the genres column)
# through the open connection, then display the resulting frame.
movie_basics_query = """select *
                    from movie_basics;
                   """
imdb = pd.read_sql(movie_basics_query, conn)
imdb

Unnamed: 0,movie_id,primary_title,original_title,start_year,runtime_minutes,genres
0,tt0063540,Sunghursh,Sunghursh,2013,175.0,"Action,Crime,Drama"
1,tt0066787,One Day Before the Rainy Season,Ashad Ka Ek Din,2019,114.0,"Biography,Drama"
2,tt0069049,The Other Side of the Wind,The Other Side of the Wind,2018,122.0,Drama
3,tt0069204,Sabse Bada Sukh,Sabse Bada Sukh,2018,,"Comedy,Drama"
4,tt0100275,The Wandering Soap Opera,La Telenovela Errante,2017,80.0,"Comedy,Drama,Fantasy"
...,...,...,...,...,...,...
146139,tt9916538,Kuambil Lagi Hatiku,Kuambil Lagi Hatiku,2019,123.0,Drama
146140,tt9916622,Rodolpho Teóphilo - O Legado de um Pioneiro,Rodolpho Teóphilo - O Legado de um Pioneiro,2015,,Documentary
146141,tt9916706,Dankyavar Danka,Dankyavar Danka,2013,,Comedy
146142,tt9916730,6 Gunn,6 Gunn,2017,116.0,


### 2.Data cleaning 

#### 2.1 bom

In [6]:
# Convert foreign_gross to a numeric dtype; any entry that cannot be parsed
# as a number becomes NaN (errors='coerce'). Nothing is removed here.
# NOTE(review): strings with thousands separators (e.g. "1,131.6") would also
# be coerced to NaN - confirm whether the raw file contains such entries.
foreign_gross_numeric = pd.to_numeric(bom['foreign_gross'], errors='coerce')
bom['foreign_gross'] = foreign_gross_numeric
# Count the missing values per column.
bom.isnull().sum()

title                0
studio               5
domestic_gross      28
foreign_gross     1355
year                 0
dtype: int64

In [7]:
# Impute the missing revenue figures with the mean of their column.
# NOTE(review): foreign_gross is missing for 1355 rows, so mean imputation
# dominates small releases - e.g. a title with 6,200 domestic gross ends up
# with roughly 75 million total revenue. Confirm this is acceptable before
# drawing conclusions from the totals.
mean_value = bom['foreign_gross'].mean()
mean_value1 = bom['domestic_gross'].mean()
# Assign the filled Series back to the frame instead of calling
# fillna(..., inplace=True) on a column selection: that chained in-place form
# emits a FutureWarning on pandas 2.x and leaves `bom` unchanged under
# Copy-on-Write.
bom['foreign_gross'] = bom['foreign_gross'].fillna(mean_value)
bom['domestic_gross'] = bom['domestic_gross'].fillna(mean_value1)
# Verify that no missing revenue values remain.
bom.isnull().sum()

title             0
studio            5
domestic_gross    0
foreign_gross     0
year              0
dtype: int64

In [8]:
# Total revenue per movie = domestic + foreign gross, stored in millions.
gross_revenue = bom['domestic_gross'] + bom['foreign_gross']
bom['Gross_Revenue in millions'] = gross_revenue / 1000000
# Keep only the title and the revenue-in-millions column.
bom = bom.drop(columns=["studio", "year", "domestic_gross", "foreign_gross"])
bom

Unnamed: 0,title,Gross_Revenue in millions
0,Toy Story 3,1067.000000
1,Alice in Wonderland (2010),1025.500000
2,Harry Potter and the Deathly Hallows Part 1,960.300000
3,Inception,828.300000
4,Shrek Forever After,752.600000
...,...,...
3382,The Quake,75.063242
3383,Edward II (2018 re-release),75.061842
3384,El Pacto,75.059542
3385,The Swan,75.059442


#### 2.2 IMDB

In [9]:
# Drop the columns that are not needed for the genre analysis.
# (No rows are removed in this cell; nulls are only inspected in the next one.)
unused_imdb_columns = ["movie_id", "start_year", "runtime_minutes"]
imdb = imdb.drop(unused_imdb_columns, axis="columns")
imdb

Unnamed: 0,primary_title,original_title,genres
0,Sunghursh,Sunghursh,"Action,Crime,Drama"
1,One Day Before the Rainy Season,Ashad Ka Ek Din,"Biography,Drama"
2,The Other Side of the Wind,The Other Side of the Wind,Drama
3,Sabse Bada Sukh,Sabse Bada Sukh,"Comedy,Drama"
4,The Wandering Soap Opera,La Telenovela Errante,"Comedy,Drama,Fantasy"
...,...,...,...
146139,Kuambil Lagi Hatiku,Kuambil Lagi Hatiku,Drama
146140,Rodolpho Teóphilo - O Legado de um Pioneiro,Rodolpho Teóphilo - O Legado de um Pioneiro,Documentary
146141,Dankyavar Danka,Dankyavar Danka,Comedy
146142,6 Gunn,6 Gunn,


In [10]:
# Count the missing values in each remaining IMDB column.
imdb.isnull().sum()

primary_title        0
original_title      21
genres            5408
dtype: int64

### 3. Checking for similarities between the two data sets in order to group the movies by genre

##### Compare the title column in the BOM DataFrame with the primary_title and original_title columns in the IMDB DataFrame to find similar titles.

##### Create a New Column in the BOM DataFrame that appends the genres from the IMDB DataFrame to the rows in the BOM DataFrame where titles are similar.

In [11]:
# Prerequisite (run once, outside the notebook): pip install pandas fuzzywuzzy

In [12]:
import pandas as pd
from fuzzywuzzy import process


In [None]:
# Build the list of candidate titles: every non-null original_title in imdb.
known_original_titles = imdb['original_title'].dropna()
imdb_titles = known_original_titles.to_list()


In [14]:
# Helpers that map one BOM title to the genres of its best-matching IMDB title.
def _genres_for_title(matched_title):
    """Return the first non-null `genres` value among the `imdb` rows whose
    `original_title` equals `matched_title`, or None when there is none."""
    genres = imdb.loc[imdb['original_title'] == matched_title, 'genres'].dropna()
    return genres.iloc[0] if not genres.empty else None


def get_best_match_genres(title, choices, scorer, cutoff=90):
    """Return the genres of the IMDB title that best matches `title`.

    Reads the notebook-level `imdb` DataFrame (columns `original_title` and
    `genres`) and uses `process.extractOne` for the fuzzy comparison.

    Parameters
    ----------
    title : str
        Title to look up. Non-string or blank values yield None.
    choices : list of str
        Candidate titles handed to `process.extractOne`. The exact-match
        shortcut below assumes they were taken from `imdb['original_title']`,
        as done in this notebook.
    scorer : callable
        Similarity function forwarded to `process.extractOne`.
    cutoff : int, default 90
        Minimum score the best candidate must reach to be accepted.

    Returns
    -------
    The `genres` value of the matched row, or None when `title` is unusable,
    no candidate reaches `cutoff`, or the matched title has no genres.
    """
    # Missing titles (NaN/None) or blank strings cannot be matched.
    if not isinstance(title, str) or not title.strip():
        return None

    # Fast path: a title that appears verbatim in imdb is resolved by direct
    # lookup, avoiding a fuzzy scan over every candidate for that row.
    if cutoff <= 100:
        exact_genres = _genres_for_title(title)
        if exact_genres is not None:
            return exact_genres

    best_match = process.extractOne(title, choices, scorer=scorer)
    if best_match and best_match[1] >= cutoff:
        # The helper returns None instead of raising when the matched title
        # has no row (or only null genres) in imdb.
        return _genres_for_title(best_match[0])
    return None


In [15]:
# Attach to every BOM row the genres of its best-matching IMDB title,
# scoring candidates with fuzzywuzzy's plain ratio.
def _genres_for_bom_title(bom_title):
    """Look up the genres for one BOM title among `imdb_titles`."""
    return get_best_match_genres(bom_title, imdb_titles, process.fuzz.ratio)


bom['genres'] = bom['title'].apply(_genres_for_bom_title)


: 

: 

##### Got stuck trying to find similar titles between bom['title'] and the IMDB primary title in order to create a new column in bom that will hold the genres of the corresponding title.