# Correlation analysis - Movies Industry

#### Importing libraries and settings

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib
import numpy as np

import matplotlib.pyplot as plt
from matplotlib.pyplot import plot
from matplotlib.pyplot import figure

# Apply the 'ggplot' style sheet to every matplotlib figure created below.
plt.style.use('ggplot')
# IPython magic (not plain Python): render figures inline in the notebook.
%matplotlib inline
# Default figure size as (width, height).
matplotlib.rcParams['figure.figsize'] = (12,8)



#### Reading the CSV file and taking a quick look at it

In [None]:
# Load the movies dataset from the working directory and preview its first rows.
movies_csv = 'movies.csv'
df = pd.read_csv(movies_csv)
df.head(n=5)


#### A general view

In [None]:
# Show pandas' summary statistics for the frame as the cell output.
df.describe()

## The cleaning

#### Remove duplicate rows

In [None]:
# Remove fully duplicated rows in place, keeping the first occurrence of each.
df.drop_duplicates(keep='first', inplace=True)

#### Seeing types

In [None]:
# Show the dtype of every column as the cell output.
df.dtypes

#### Changing some types

In [None]:
# Cast the money columns to pandas' nullable integer dtype ('Int64'),
# which can hold whole numbers alongside missing values.
for money_col in ('budget', 'gross'):
    df[money_col] = df[money_col].astype('Int64')

#### Let's see NaN

In [None]:
# Among the rows whose budget is missing, count the non-null values per column.
budget_missing = df['budget'].isna()
df.loc[budget_missing].count()

In [None]:
# Drop rows in which every column is missing. Without inplace=True (or an
# assignment) dropna only returns a new frame and leaves df untouched.
df.dropna(how='all', inplace=True)

# Report the share of missing values per column as a real percentage;
# the mean of the boolean mask is a 0-1 fraction, so scale it by 100.
for col in df.columns:
    pct_missing = df[col].isnull().mean() * 100
    print('{} --- {:.2f} %'.format(col, pct_missing))

#### Released Dates Columns

In [None]:
# 'released' is split on '(' and the part before it is parsed as the date.
release_parts = df['released'].str.split('(', expand=True)
df['released_date'] = pd.to_datetime(release_parts[0])

# Derive the calendar year and month from the parsed release date.
release_dt = df['released_date'].dt
df['released_year'] = release_dt.year
df['released_month'] = release_dt.month

#### Country

In [None]:
# Take the text after the first '(' in 'released' as the country.
released_country = df['released'].str.split('(', expand=True)[1]

# Strip the closing parenthesis and surrounding whitespace with vectorised
# string methods. Unlike str(x), these keep missing values as NaN instead
# of turning them into the literal strings 'None' / 'nan'.
# NOTE(review): this overwrites any 'country' column already present in the
# CSV -- confirm that is intended.
df['country'] = released_country.str.replace(')', '', regex=False).str.strip()

df.head(2)

#### Rate of Return (RR)

In [None]:
# Percentage rate of return: how much gross exceeds budget, relative to budget.
df['PctRR'] = df['gross'].div(df['budget']).sub(1).mul(100)
df['PctRR']

##### Drop NaN RR

In [None]:
# Keep only rows where both budget and gross are known. With inplace=True
# dropna mutates df and returns None, so its result is not assigned
# (the former `ddf = ...` only ever bound ddf to None).
df.dropna(subset=['budget', 'gross'], inplace=True)
print('Good!!!!!!!!!!!')
df.tail(4)


##### Just looking at the best RR

In [None]:
# Display the rows ordered from highest to lowest rate of return
# (returns a sorted copy; df itself is left unchanged).
df.sort_values('PctRR', ascending=False)

## Correlations

##### Guesses
- Gross vs Budget
- RR vs Budget
- Gross vs Time
- RR vs Time

#### Gross vs Budget

In [None]:
# Scatter plot of gross earnings against budget, one point per film.
fig, ax = plt.subplots()
ax.scatter(df['budget'], df['gross'])
ax.set_title('Budget vs Gross Earnings')
ax.set_xlabel('Budget for Film')
ax.set_ylabel('Gross Earnings')

plt.show()

##### Plot with seaborn

In [None]:

# Convert the money columns to plain float64 before the seaborn plots below.
for money_col in ('gross', 'budget'):
    df[money_col] = df[money_col].astype('float64')

df.dtypes


In [None]:
# Budget vs gross with a fitted linear regression line:
# red scatter points, blue regression line.
# (The unused `from ctypes.wintypes import PINT` import, apparently an
# accidental auto-import, was removed; nothing in the notebook used it.)
sns.lmplot(
    x='budget',
    y='gross',
    data=df,
    scatter_kws={'color': 'red'},
    line_kws={'color': 'blue'},
)

In [None]:
# Correlate only the numeric columns. df still holds text columns here, and
# since pandas 2.0 DataFrame.corr() no longer drops them silently but raises
# when it cannot convert them to float. Selecting numeric dtypes explicitly
# works on both old and new pandas versions.
numeric_df = df.select_dtypes(include='number')
corr_matrix = numeric_df.corr()

# Annotated heatmap of the pairwise correlations.
sns.heatmap(corr_matrix, annot=True)
plt.title('Correlation Matrix')
plt.xlabel('Movie feature')
plt.ylabel('Movie feature')
plt.show()

#### Let's see 'company'

In [None]:
# Work on a copy so the original frame keeps its readable text values.
df_numerized = df.copy()

# Replace every object-dtype column with the integer codes of its categories
# so those columns can take part in the correlation matrix.
object_columns = [
    name for name in df_numerized.columns
    if df_numerized[name].dtype == 'object'
]
for name in object_columns:
    df_numerized[name] = df_numerized[name].astype('category').cat.codes

df_numerized
        

In [None]:
# Correlation matrix of the numerized frame, drawn as an annotated heatmap.
corr_matrix = df_numerized.corr()
heatmap_ax = sns.heatmap(corr_matrix, annot=True)
heatmap_ax.set_title('Correlation Matrix')
heatmap_ax.set_xlabel('Movie feature')
heatmap_ax.set_ylabel('Movie feature')
plt.show()

In [None]:
# Show the correlation matrix of the numerized frame as a table.
df_numerized.corr()

In [None]:
# Flatten the correlation matrix into a Series indexed by (feature, feature)
# pairs and order it from weakest to strongest correlation.
corr_matrix_num = df_numerized.corr()
corr_pairs = corr_matrix_num.unstack()
sorted_pairs = corr_pairs.sort_values()

# Keep strongly correlated pairs only. The upper bound (< 1) is meant to
# leave out each feature's correlation with itself. The previous expression
# indexed with two lists of boolean Series and tested `> 1`, which a
# correlation coefficient can never satisfy.
high_corr = sorted_pairs[(sorted_pairs > 0.5) & (sorted_pairs < 1)]
high_corr

##### Guesses
- Gross vs Budget ✓
- RR vs Budget x
- Gross vs Time x
- RR vs Time x

## Votes and budget have the highest correlations with gross earnings