In [None]:
# Importing dependencies
import pandas as pd
import numpy as np
import datetime
import findspark
findspark.init()
from pyspark.sql import SparkSession
import time
import gender_guesser.detector as gender
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Load the names dataset CSV into a pandas DataFrame and preview it
csv_path = './Resources/names_cleaned_data.csv'
df = pd.read_csv(csv_path)
df.head()

In [None]:
# Keep only the digits of archive_info as the show number, then drop the
# columns that are no longer needed.
# regex=True is required: since pandas 2.0 str.replace treats the pattern as a
# literal string by default, which would leave archive_info unchanged.
df['show_number'] = df['archive_info'].str.replace(r'\D', '', regex=True)
df = df.drop(columns=['archive_info', 'dt_indx'])
df.head()

In [None]:
# Split full_name into first name, last name and a remainder column ('X').
# n=2 caps the split at three pieces and reindex pads any missing piece, so
# the three-column assignment succeeds no matter how many words a name has
# (everything after the second space ends up in 'X').
name_parts = (
    df['full_name']
    .str.split(' ', n=2, expand=True)
    .reindex(columns=range(3))
)
df[['First_Name', 'Last_Name', 'X']] = name_parts
df.head()


In [None]:
# Select and order the columns used for the rest of the analysis.
# .copy() makes df_ref an independent frame, so adding a column to it later
# does not raise SettingWithCopyWarning or write to a temporary slice of df.
column_order = [
    'full_name', 'First_Name', 'Last_Name', 'occupation', 'city', 'state',
    'show_number', 'date', 'winner_flg', 'final_score',
]
df_ref = df[column_order].copy()
df_ref

In [None]:
# Infer a gender label for every row from the contestant's first name
gd = gender.Detector()
df_ref['Gender'] = df_ref['First_Name'].map(gd.get_gender)
df_ref.head()

In [None]:
# Fold the detector's "mostly_male" / "mostly_female" labels into plain
# male / female (decision based on visual inspection of the data).
# The replacement is limited to the Gender column so an identical string in
# any other column is left untouched.
df2 = df_ref.assign(Gender=df_ref['Gender'].replace('mostly_male', 'male'))
df3 = df2.assign(Gender=df2['Gender'].replace('mostly_female', 'female'))

In [None]:
# Check the number of rows per gender label - especially 'unknown' and 'andy'
gender_counts = df3['Gender'].value_counts()
for label in ('male', 'female', 'unknown', 'andy'):
    # .get reports 0 instead of raising KeyError when a label never occurs
    print(gender_counts.get(label, 0))



In [None]:
# Remove every row whose Gender label is 'unknown'
# (df4 temporarily holds the index labels of those rows)
df4 = df3.index[df3['Gender'] == 'unknown']
df3.drop(index=df4, inplace=True)
len(df3)


In [None]:
# Remove every row whose Gender label is 'andy'
# (df5 temporarily holds the index labels of those rows)
df5 = df3.index[df3['Gender'] == 'andy']
df3.drop(index=df5, inplace=True)
len(df3)

In [None]:
# Preview of the gender-labelled dataframe (further columns are added below)
df3.head()

In [None]:
# Show the dtype of every column
print(df3.dtypes)

In [None]:
# Parse the date column into pandas datetime values
df3['date'] = pd.to_datetime(df3['date'])


In [None]:
# Extract the year from date into a new Year column.
# strftime yields the year as a 4-character string (not an int); the later
# filter compares it against the string '2012'.
df3['Year'] = df3['date'].dt.strftime('%Y')


In [None]:
# 2012 is represented by only 3 rows (per the original analysis), so that
# year is removed entirely; df6 holds the index labels of those rows
df6 = df3.index[df3['Year'] == '2012']
df3.drop(index=df6, inplace=True)
df3.head()

In [None]:
# Add 0/1 indicator columns flag_female and flag_male derived from Gender.
# Explicit comparisons guarantee that both columns always exist (the previous
# split/explode/pivot round-trip only produced a column for genders that
# actually occur) and make the intent readable at a glance.
df4 = df3.assign(
    flag_female=df3['Gender'].eq('female').astype('int64'),
    flag_male=df3['Gender'].eq('male').astype('int64'),
)
df4

In [None]:
# Total number of individual contestants (distinct full names)
df4['full_name'].nunique(dropna=True)

In [None]:
# Keep only the first row for every full_name so each contestant appears once
is_repeat = df4.duplicated(subset='full_name', keep='first')
once_df = df4.loc[~is_repeat]
once_df.head()

In [None]:
# Number of rows in the one-row-per-contestant frame
once_df.shape[0]

In [None]:
# Unique contestants labelled male, and how many there are
men_df = once_df[once_df['Gender'].eq('male')]
men_df.shape[0]


In [None]:
# Unique contestants labelled female, and how many there are
women_df = once_df[once_df['Gender'].eq('female')]
women_df.shape[0]


In [None]:
# Pie chart of unique contestants split by gender
gender_labels = ['Men', 'Women']
gen_pie = pd.DataFrame(
    {
        'Gender': gender_labels,
        'Count': [len(men_df), len(women_df)],
    },
    index=gender_labels,
)

gen_pie.plot(kind='pie', y='Count', title='Participants')


In [None]:
# Keep an independent (deep) copy of the flagged dataframe
jeo_df = df4.copy(deep=True)

In [None]:
# Write the flagged dataframe to CSV (the index is written as the first column)
df4.to_csv('myjeopardy.csv', index=True)

In [None]:
# Start (or reuse) a Spark session and load the pandas frame into Spark
spark = (
    SparkSession.builder
    .appName("PandasToSparkDF")
    .getOrCreate()
)
df_spark = spark.createDataFrame(df4)
df_spark.show()

In [None]:
# Register the Spark frame as the temporary view 'gen_comp' so it can be
# queried with SQL, then count the rows for each gender
df_spark.createOrReplaceTempView('gen_comp')
gender_count_sql = """
SELECT COUNT(full_name) AS Contestants, Gender
FROM gen_comp
GROUP BY Gender;
"""
gen_num = spark.sql(gender_count_sql).toPandas()
gen_num

In [None]:
# Quick bar chart of the per-gender counts.
# x='Gender' labels each bar with its gender instead of the row position.
gen_num.plot(kind='bar', x='Gender', y='Contestants')

In [None]:
# Bar graph of the per-gender row counts (female vs male)
fig, ax = plt.subplots(figsize=(8, 6))
plots = sns.barplot(x='Gender', y='Contestants', data=gen_num, ax=ax)
ax.set_xlabel('Gender', size=15)
ax.set_ylabel('Number of Games', size=15)
ax.set_title('Number of games played (from 2014 to 2019)')
plt.show()

In [None]:
# Count the rows flagged male for each year, ordered by year
male_by_year_sql = """
SELECT Year, COUNT(flag_male) AS Male
FROM gen_comp
WHERE flag_male ==1
GROUP BY Year
ORDER BY Year ASC;
"""
male_year_df = spark.sql(male_by_year_sql).toPandas()
male_year_df

In [None]:
# Count the rows flagged female for each year, ordered by year
female_by_year_sql = """
SELECT Year, COUNT(flag_female) AS Female
FROM gen_comp
WHERE flag_female ==1
GROUP BY Year
ORDER BY Year ASC;
"""
female_year_df = spark.sql(female_by_year_sql).toPandas()
female_year_df

In [None]:
# Put the yearly male and female counts side by side (inner join on Year)
year_df = male_year_df.merge(female_year_df, how='inner', on=['Year'])
year_df

In [None]:
# Grouped bar chart of male vs female contestants participating each year
ax = year_df.plot(x='Year', y=['Male', 'Female'], kind='bar')
ax.set_title('Men vs Women Contestants')
ax.set_ylabel('Number of Games played')


In [None]:
# Line chart of the same yearly male / female counts
ax = year_df.plot(x='Year')
ax.set_xlabel('Year')
ax.set_ylabel('Number of Games played')
ax.set_title('Men vs Women Contestants')

In [None]:
# Count winning rows (winner_flg == 1) for each gender
winners_by_gender_sql = """
SELECT COUNT(full_name) AS Winners, Gender
FROM gen_comp
WHERE winner_flg ==1
GROUP BY Gender

"""
win_df = spark.sql(winners_by_gender_sql).toPandas()
win_df

In [None]:
# Bar chart of the number of wins per gender
Gender = win_df['Gender']
Winners = win_df['Winners']
fig, ax = plt.subplots(figsize=(8, 5))
ax.bar(Gender, Winners, width=0.5)
ax.set_title('Winners (from 2014 to 2019)')
ax.set_ylabel('Number of games won')
plt.show()

In [None]:
#Plotting male vs female winners total
def addlabels(x, y, ax=None):
    """Write each value of ``y`` as a text label at the top of its bar.

    Parameters
    ----------
    x : sequence
        Bar categories. Only its length is used: one label is drawn per
        element, at horizontal positions 0, 1, 2, ...
    y : sequence
        Bar heights. Each value is used both as the label's vertical position
        and as the label text.
    ax : object with a ``text(x, y, s)`` method, optional
        Axes to draw on. When omitted, ``plt.text`` (the current pyplot axes)
        is used, as before.
    """
    draw_text = plt.text if ax is None else ax.text
    # Walk y positionally instead of indexing y[i]: on a pandas Series y[i] is
    # a label lookup and fails when the index is not 0..n-1.
    for position, value in zip(range(len(x)), y):
        draw_text(position, value, value)
        
# Bar chart of wins per gender with the count written on each bar
x = win_df['Gender']
y = win_df['Winners']
fig, ax = plt.subplots(figsize=(10, 7))
ax.bar(x, y, width=0.5)
addlabels(x, y)
ax.set_title('Winners (from 2014 to 2019)')
ax.set_xlabel('Gender')
ax.set_ylabel('Number of Games won')
plt.show()

In [None]:
# Count winning rows flagged female for each year, ordered by year
female_wins_by_year_sql = """
SELECT Year, COUNT(flag_female) AS Female
FROM gen_comp
WHERE flag_female ==1 AND winner_flg ==1
GROUP BY Year
ORDER BY Year ASC;
"""
win_female_year_df = spark.sql(female_wins_by_year_sql).toPandas()
win_female_year_df

In [None]:
# Count winning rows flagged male for each year, ordered by year
male_wins_by_year_sql = """
SELECT Year, COUNT(flag_male) AS Male
FROM gen_comp
WHERE flag_male ==1 AND winner_flg ==1
GROUP BY Year
ORDER BY Year ASC;
"""
win_male_year_df = spark.sql(male_wins_by_year_sql).toPandas()
win_male_year_df

In [None]:
# Sum final_score per contestant name, lowest totals first, and keep the
# ten lowest
low_score_sql = """
SELECT full_name AS Name, SUM(final_score) AS Score
FROM gen_comp
GROUP BY full_name
ORDER BY Score ASC;
"""
low_score_df = spark.sql(low_score_sql).toPandas()
low_sc_df = low_score_df.head(10)
low_sc_df

In [None]:
# Horizontal bar chart of the ten lowest total scores.
# x='Name' labels each bar with the contestant's name instead of the row
# position 0-9; title and axis labels mirror the highest-scorers chart.
low_sc_df.plot(kind='barh', x='Name', y='Score')
plt.title('Lowest Scorers')
plt.xlabel('Points scored')
plt.ylabel('Contestants')

In [None]:
# Sum final_score per contestant name, highest totals first, and keep the
# ten highest
high_score_sql = """
SELECT full_name AS Name, SUM(final_score) AS Score
FROM gen_comp
GROUP BY full_name
ORDER BY Score DESC;
"""
high_score_df = spark.sql(high_score_sql).toPandas()
high_sc_df = high_score_df.head(10)
high_sc_df

In [None]:
# Horizontal bar chart of the ten highest total scores.
# x='Name' labels each bar with the contestant's name; without it the bars
# are labelled with the row positions 0-9 under a 'Contestants' axis title.
high_sc_df.plot(kind='barh', x='Name', y='Score')
plt.title('Highest Scorers')
plt.xlabel('Points scored')
plt.ylabel('Contestants')

In [None]:
# Put the yearly female and male winner counts side by side (inner join on Year)
win_year_df = win_female_year_df.merge(win_male_year_df, how='inner', on='Year')
win_year_df

In [None]:
# Grouped bar chart of male vs female winners per year
ax = win_year_df.plot(x='Year', y=['Male', 'Female'], kind='bar')
ax.set_title('Men vs Women Winners')
ax.set_ylabel('Number of Winners')

In [None]:
# Line chart of the same yearly winner counts
ax = win_year_df.plot(x='Year')
ax.set_xlabel('Year')
ax.set_ylabel('Number of Winners')
ax.set_title('Men vs Women Winners')

In [None]:
# Frequency of each first name across all rows (repeat appearances included)
print(df4['First_Name'].value_counts())

In [None]:
# One row per distinct full name
df5 = df4['full_name'].unique()
names_df = pd.DataFrame({'Names': df5})
names_df

In [None]:
# Split each unique name into first name, last name and a remainder ('X').
# n=2 caps the split at three pieces and reindex pads any missing piece, so
# the three-column assignment succeeds no matter how many words a name has.
unique_name_parts = (
    names_df['Names']
    .str.split(' ', n=2, expand=True)
    .reindex(columns=range(3))
)
names_df[['First_Name', 'Last_Name', 'X']] = unique_name_parts
names_df

In [None]:
# Most common first names among unique contestants.
# The column names are set explicitly via rename_axis / reset_index(name=...)
# because the names value_counts() gives its result differ between pandas
# versions (pandas >= 2.0 calls the counts 'count' and names the index after
# the column), which made the old rename label the wrong column.
pop_names = names_df['First_Name'].value_counts()
pop_names_df = (
    pop_names
    .rename_axis('Contestant First_name')
    .reset_index(name='Occurences')
)
pop_names_df.head(10)


In [None]:
# Rows where the contestant won the game (winner_flg == 1)
is_winner = df4['winner_flg'].eq(1)
winners_df = df4[is_winner]
winners_df

In [None]:
# One row per distinct winner name, split into first name, last name and a
# remainder ('X').
# n=2 caps the split at three pieces and reindex pads any missing piece, so
# the three-column assignment succeeds no matter how many words a name has.
win_names = winners_df['full_name'].unique()
win_names_df = pd.DataFrame(win_names, columns=['Winames'])
winner_name_parts = (
    win_names_df['Winames']
    .str.split(' ', n=2, expand=True)
    .reindex(columns=range(3))
)
win_names_df[['First_Name', 'Last_Name', 'X']] = winner_name_parts
win_names_df


In [None]:
# Most common first names among unique winners.
# Column names are set explicitly because the names value_counts() gives its
# result differ between pandas versions (pandas >= 2.0 calls the counts
# 'count'), which made the old rename label the wrong column.
pop_winner_names = win_names_df['First_Name'].value_counts()
pop_winner_names_df = (
    pop_winner_names
    .rename_axis('Winner First_name')
    .reset_index(name='Occurences')
)
pop_winner_names_df.head(10)

In [None]:
# Styled table of the ten most common winner first names
pop_win = pop_winner_names_df.head(10)
pop_win_styler = pop_win.style.background_gradient(cmap='viridis')
pop_win_styler.set_properties(**{'font-size': '20px'})

In [None]:
# Styled table of the ten most common winner last names.
# Column names are set explicitly because the names value_counts() gives its
# result differ between pandas versions (pandas >= 2.0 calls the counts
# 'count'), which made the old rename label the wrong column.
pop_winner_last_names = win_names_df['Last_Name'].value_counts()
pop_winner_last_names_df = (
    pop_winner_last_names
    .rename_axis('Winner Last_name')
    .reset_index(name='Occurences')
)
pop_win_last = pop_winner_last_names_df.iloc[:10]
pop_win_last.style.background_gradient(cmap='viridis')\
                              .set_properties(**{'font-size':'20px'})

In [None]:
# Preview the stored copy of the flagged dataframe before exporting it
jeo_df.head()

In [None]:
# Write the stored copy to CSV (the index is written as the first column)
jeo_df.to_csv('final_gender_jeop.csv', index=True)