In [19]:
# Import the libraries needed for this analysis
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import seaborn as sns
import matplotlib.pyplot as plt
from plotly.subplots import make_subplots 


In [37]:
# Show every column whenever a DataFrame is rendered in the notebook
pd.set_option('display.max_columns', None)

# Load the speed dating dataset
df = pd.read_csv("src/Speed Dating Data.csv")

# Preview the first rows
df.head()

## 1. Rapid visualisation of dataset

In [54]:
# Quick summary statistics for every column, whatever its type
df.describe(include="all")

#### The statistics returned by `describe` show that the large majority of this dataset's columns hold numerical data. 
#### This can be seen from the `unique`, `top` and `freq` rows, which are only computed for string columns: they contain `NaN` for all the other columns. 

#### This means that, for our analyses, we will need to convert the relevant columns into string labels, following the dataset description.

## 2. Evaluating missing values

#### Before analysing this dataset, we need to evaluate the missing values in each column. 

In [55]:
# Share of missing values in each column, sorted from the most complete
# column to the emptiest one
df_nan = df.isna().mean().sort_values()

# The future bar chart of 'df_nan' cannot label every bar on its x axis, so the
# column names are printed here, in plotting order, to locate the unlabeled bars
print(''.join(f'{column},' for column in df_nan.index), end='')

In [56]:
# Bar chart (plotly.express) of the rate of missing values of each column
axis_labels = {"index": "name of variables", "value": "Rate of missing values"}
fig = px.bar(df_nan, labels=axis_labels, width=1500, height=600)

# A single series is plotted, so the legend carries no information
fig.update_layout(showlegend=False)

fig.show()

#### We can observe that many columns have missing values in more than half of their rows. 
#### To analyse this dataset, we should only take into account the columns whose missing values do not exceed half of their data.

## 3. Dealing with categorical variables

In [41]:
# Work on an independent copy so that the raw frame 'df' keeps its numeric codes
df_copy = df.copy()


def _decode_column(codes, labels, fallback):
    '''
    Translate a column of numeric codes into readable labels.

    Parameters:
    codes : pandas Series
        Column holding the numeric codes.
    labels : dict
        Mapping from numeric code to label.
    fallback : str
        Label given to a non-missing code that is absent from 'labels'.

    Returns:
    pandas Series of labels. Missing codes stay missing: they are NOT given
    the fallback label, which would invent answers nobody gave.
    '''
    return codes.map(lambda code: labels.get(code, fallback), na_action='ignore')


# Label tables shared by several columns
yes_no_labels = {0: 'no', 1: 'yes'}

race_labels = {
    1: 'Black/African American',
    2: 'European/Caucasian-American',
    3: 'Latino/Hispanic American',
    4: 'Asian/Pacific Islander/Asian-American',
    5: 'Native American',
    6: 'Other',
}

# Code 7 means 'Almost never' in the dataset key; it used to be shown as 'Other'
frequency_labels = {
    1: 'Several times a week',
    2: 'Twice a week',
    3: 'Once a week',
    4: 'Twice a month',
    5: 'Once a month',
    6: 'Several times a year',
    7: 'Almost never',
}

goal_labels = {
    1: 'Seemed like a fun night out',
    2: 'To meet new people',
    3: 'To get a date',
    4: 'Looking for a serious relationship',
    5: 'To say I did it',
    6: 'Other',
}

field_labels = {
    1: 'Law',
    2: 'Math',
    3: 'Social Science, Psychologist ',
    4: 'Medical Science, Pharmaceuticals, and Bio Tech ',
    5: 'Engineering ',
    6: 'English/Creative Writing/ Journalism ',
    7: 'History/Religion/Philosophy ',
    8: 'Business/Econ/Finance ',
    9: 'Education, Academia ',
    10: 'Biological Sciences/Chemistry/Physics',
    11: 'Social Work ',
    12: 'Undergrad/undecided ',
    13: 'Political Science/International Affairs ',
    14: 'Film',
    15: 'Fine Arts/Arts Administration',
    16: 'Languages',
    17: 'Architecture',
    18: 'Other',
}

career_labels = {
    1: 'Lawyer',  # was misspelled 'Lawer'
    2: 'Academic/Research ',
    3: 'Psychologist ',
    4: 'Doctor/Medicine ',
    5: 'Engineer',
    6: 'Creative Arts/Entertainment ',
    7: 'Banking/Consulting/Finance/Marketing/Business/CEO/Entrepreneur/Admin ',
    8: 'Real Estate ',
    9: 'International/Humanitarian Affairs  ',
    10: 'Undecided',
    11: 'Social Work ',
    12: 'Speech Pathology ',
    13: 'Politics',
    14: 'Pro sports/Athletics',
    15: 'Other',
    16: 'Journalism',
    17: 'Architecture',
}

# column -> (label table, label of a non-missing code absent from the table)
recoding = {
    'gender': ({0: 'Female', 1: 'Male'}, 'Male'),
    'condtn': ({1: 'limited choice', 2: 'extensive choice'}, 'extensive choice'),
    'match': (yes_no_labels, 'no'),
    'dec': (yes_no_labels, 'no'),
    'dec_o': (yes_no_labels, 'no'),
    'samerace': (yes_no_labels, 'no'),
    'length': ({1: 'Too little', 2: 'Too much', 3: 'Just Right'}, 'Just Right'),
    'numdat_2': ({1: 'Too few', 2: 'Too many', 3: 'Just Right'}, 'Just Right'),
    'race_o': (race_labels, 'Other'),
    'race': (race_labels, 'Other'),
    'goal': (goal_labels, 'Other'),
    'date': (frequency_labels, 'Other'),
    'go_out': (frequency_labels, 'Other'),
    'field_cd': (field_labels, 'Other'),
    'career_c': (career_labels, 'Other'),
}

for column, (labels, fallback) in recoding.items():
    df_copy[column] = _decode_column(df_copy[column], labels, fallback)

# Income is read as text: drop the '.' characters, then turn ',' into the decimal
# point and convert to float (same transformation as before, missing values stay NaN).
# NOTE(review): a value written '69,487.00' becomes 69.487 - confirm the intended unit.
# The conversion is skipped when the column is already numeric, because repeating it
# on floats would strip their decimal point.
if not pd.api.types.is_numeric_dtype(df_copy['income']):
    df_copy['income'] = (
        df_copy['income']
        .astype(str)
        .str.replace('.', '', regex=False)
        .str.replace(',', '.', regex=False)
        .astype(float)
    )

                                           

## 4. Some distributions analysis

#### To do this, we will write a function that will return all distributions of a set of variables we choose

In [42]:
import plotly.graph_objects as go
from plotly.subplots import make_subplots

def repartition_multi_plot(df, cols, titles, cat):
    '''
    Plot the distribution of a set of chosen variables, split by the categories of 'cat',
    and display the one selected from a dropdown button.

    Parameters:
    df : pandas DataFrame
        The DataFrame whose column distributions we want to analyse.
    cols : list
        Set of chosen columns.
    titles : list
        Button label of each column's distribution, in the same order as 'cols'.
    cat : str
        Categorical variable on which we want to divide the DataFrame.
        One histogram is drawn per category (any number of categories).

    Returns:
    None. The figure is displayed with fig.show().

    Raises:
    ValueError
        If 'cols' and 'titles' do not have the same length.
    '''
    if len(cols) != len(titles):
        raise ValueError("'cols' and 'titles' must have the same length")

    # Colours of the first two categories; further categories use plotly's defaults
    palette = ['#EB89B5', '#330C73']

    # Division of DataFrame 'df' into one (category, sub-DataFrame) pair per category
    groups = list(df.groupby(cat))

    # create figure
    fig = make_subplots()

    # For each column, add one trace per category
    for position, col in enumerate(cols):

        # Only the traces of the first column are visible at the beginning
        is_first = position == 0

        for rank, (category, sub_df) in enumerate(groups):
            values = sub_df[col]
            n_bins = values.nunique()

            # As before, the first column is drawn along x and the next ones along y
            if is_first:
                axis_args = dict(x=values, nbinsx=n_bins)
            else:
                axis_args = dict(y=values, nbinsy=n_bins)

            fig.add_trace(
                go.Histogram(
                    name=str(category),
                    marker_color=palette[rank] if rank < len(palette) else None,
                    visible=is_first,
                    **axis_args
                )
            )

    def create_layout_button(position, title):
        '''
        This function creates a dropdown button that shows the traces of one column only.

        Parameters:
        position : int
            Position in 'cols' of the column selected.
        title : str
            Label of the button.

        Returns:
        A dictionary representing one button of go.layout.Updatemenu().
        '''
        # Each column owns len(groups) consecutive traces, so the visibility mask
        # repeats each column's flag once per category
        visible = [index == position for index in range(len(cols)) for _ in groups]
        return dict(label=title, method='update', args=[{'visible': visible}])

    # Update layout properties
    fig.update_layout(
        updatemenus=[go.layout.Updatemenu(
            active=0,
            buttons=[create_layout_button(position, title) for position, title in enumerate(titles)],
            x=0,
            xanchor='left',
            y=1.2,
            yanchor='top',
            )
        ],
        # The legend is titled after the splitting variable ('gender' -> 'Gender')
        legend_title_text=str(cat).capitalize(),
        autosize=False,
        width=800,
        height=800)

    fig.show()


In [43]:
# Selection of some columns to anylse their distribution
repartition_cols = ['age',            
            'race',
            'date',
            'goal',
            'go_out',
            'field_cd',
            'career_c']



# Correspondant title of selected columns
repartition_titles = ['Age repartition',                       
            'Race repartition',
            'Date repartition',
            'Goal repartition',
            'Go out repartition',
            'Study fields repartition', 
            'Careers repartition']


arg = [df_copy, repartition_cols, repartition_titles, 'gender']
repartition_multi_plot(*arg)

### 4-1- Age repartition analysis

In [44]:
# Age statistics computed separately for each gender
age_statistics = ['count', 'mean', 'median', 'min', 'max','std']
age_by_gender = df_copy[['gender', 'age']].groupby('gender')
display(age_by_gender.agg(age_statistics))


#### The people who participated in this speed dating event are essentially young: the mean age is about 26 years for both women and men. 
#### Most participants are between 23 and 29 years old. The minimum age is 18 and the maximum age is 55.

### 4-2- Race repartition analysis

In [45]:
# One (label, sub-DataFrame) pair per gender; 'Female' sorts before 'Male'
female, male = df_copy.groupby('gender')

# Summary of the 'race' column of each gender, side by side
race_ = pd.concat(
    [female[1]['race'].describe(), male[1]['race'].describe()],
    axis=1,
    keys=['female', 'male'],
)
race_

#### For the race repartition, we can observe that the majority of participants are `European/Caucasian-American`. 
#### They alone represent more than half (52%) of the participants, followed by the `Asian/Pacific-Islander/Asian-American` group (25%).
#### The `Latino/Hispanic-American` and `Black/African-American` groups represent only 7% and 5% of the participants respectively.
#### This race repartition does not reflect that of the American population, given the low proportion of the `Latino/Hispanic-American` and `Black/African-American` minorities
#### and given that the `Asian/Pacific-Islander/Asian-American` group is largely overrepresented. 
#### Analyses based on this repartition may therefore not be reliable.

### 4-3- Date repartition analysis

In [46]:
# One (label, sub-DataFrame) pair per gender; 'Female' sorts before 'Male'
female, male = df_copy.groupby('gender')

# Summary of the 'date' column of each gender, side by side
date_ = pd.concat(
    [female[1]['date'].describe(), male[1]['date'].describe()],
    axis=1,
    keys=['female', 'male'],
)
date_

#### We can observe that participants go on dates with a variety of frequencies during the year.
#### The most common frequency is `Several times a year` for women and `Twice a month` for men.
#### So men who go on many dates during the year are strongly represented.

### 4-4- Goal repartition analysis

In [47]:
# One (label, sub-DataFrame) pair per gender; 'Female' sorts before 'Male'
female, male = df_copy.groupby('gender')

# Summary of the 'goal' column of each gender, side by side
goal_ = pd.concat(
    [female[1]['goal'].describe(), male[1]['goal'].describe()],
    axis=1,
    keys=['female', 'male'],
)
goal_

#### This dataset is dominated by people whose primary goal in participating in this event was to have fun.

### 4-5- Other repartition analysis

In [48]:
# One (label, sub-DataFrame) pair per gender; 'Female' sorts before 'Male'
female, male = df_copy.groupby('gender')
gender_keys = ['female', 'male']

# How often participants go out
go_out_ = pd.concat(
    [female[1]['go_out'].describe(), male[1]['go_out'].describe()],
    axis=1,
    keys=gender_keys,
)
print('Go out frequency : \n', go_out_, '\n')

# Field of study
field_ = pd.concat(
    [female[1]['field_cd'].describe(), male[1]['field_cd'].describe()],
    axis=1,
    keys=gender_keys,
)
print('Study field : \n', field_, '\n')

# Intended career
career_ = pd.concat(
    [female[1]['career_c'].describe(), male[1]['career_c'].describe()],
    axis=1,
    keys=gender_keys,
)
print('Career : \n', career_)

#### Most participants are people who go out a lot (`Twice a week`). Women have most often studied in the academic field, and men have most often studied business, economics and finance.
#### In this population, women have mostly pursued careers in academic research and men have mostly pursued careers in the finance sector.

## 5 -  Correlation between some attributes and decision

In [49]:
corr_columns = [
'dec', 'attr', 'sinc', 'intel', 'fun', 'amb', 'shar', 'like', 'prob',
'imprace', 'imprelig', 'age_o', 'race_o', 'samerace', 'income'
]

# Income is read as text, so it is parsed into a numeric series first (drop '.',
# then turn ',' into the decimal point, as before). The result goes into a derived
# frame instead of overwriting df['income']: the raw data stays intact and the cell
# can be re-run safely, whereas parsing already-parsed floats would strip their
# decimal point. The parse is skipped when the column is already numeric.
income_numeric = df['income']
if not pd.api.types.is_numeric_dtype(income_numeric):
    income_numeric = (
        income_numeric
        .astype(str)
        .str.replace('.', '', regex=False)
        .str.replace(',', '.', regex=False)
        .astype(float)
    )

corr_data = df[corr_columns].assign(income=income_numeric)

fig, ax = plt.subplots(figsize=(18,10))
sns.heatmap(corr_data.corr(), annot=True, cmap='YlGnBu', linewidths=3, square=True, ax=ax)
ax.set_title('Correlation between attributes and decision')

# Render the figure explicitly so the cell does not echo the title's repr
plt.show()

#### We can observe that the religion, race, age or income of the partner do not have any importance in the decision. 
#### But some partner attributes influence the decision to a greater or lesser extent. Let's study which ones.

In [50]:
fig, axes = plt.subplots(figsize=(21, 24), nrows=3, ncols=3)

# One panel per rated attribute: (column, panel title, y-axis label),
# listed in the order the panels fill the grid (row by row)
rating_panels = [
    ('attr', 'Attractiveness', 'Rate of attractiveness'),
    ('sinc', 'Sincerity', 'Rate of sincerity'),
    ('intel', 'Intelligence', 'Rate of Intelligence'),
    ('fun', 'Fun', 'Rate of Fun'),
    ('amb', 'Ambition', 'Rate of Ambition'),
    ('shar', 'Shared Interest', 'Rate of Shared Interest'),
    ('like', 'Like partner', 'Rate of Like partner'),
    ('prob', 'Yes probability', 'Rate of Yes probability'),
    ('imprace', 'Importance of race', 'Rate of Importance of race'),
]

# Study of the influence of each attribute on the decision, split by gender
for panel_ax, (rating, panel_title, rate_label) in zip(axes.flat, rating_panels):
    panel = sns.boxplot(x='dec', y=rating, data=df_copy, hue='gender', ax=panel_ax)
    panel.set(title=panel_title, xlabel='Decision', ylabel=rate_label)

plt.suptitle('Importance of partner attributes on the decision', fontsize=32)
plt.show()

#### We can observe that when the decision is `Yes`, the least important attribute for men is `shared interest`. 
#### This is not the case for women. However, when the decision is `Yes`, the most important attributes for women are `sincerity` and `intelligence`. 
#### Generally, when they like their partner, they agree to have a second date with them. And generally their prediction of having a second date turns out to be correct. 
#### Not surprisingly, race does not have great importance in the decision.

#### Next, we will study the real impact on the decision of what people look for in the opposite sex.

#### As the `waves` do not all use the same preference scale, we have to normalize all the rates.

In [51]:
# Attributes describing what people look for in the opposite sex
attr1 = ['attr1_1', 'sinc1_1', 'intel1_1', 'fun1_1', 'amb1_1', 'shar1_1']

# Independent copy of df_copy, so that the normalisation leaves df_copy untouched
df_copy_attr1 = df_copy.copy()

# Largest value observed in each attribute column
attr1_max = df_copy_attr1[attr1].max()

# Missing values are replaced by 0, then every column is divided by its own maximum
# NOTE(review): filling with 0 makes a missing answer count as a zero rating - confirm this is intended
df_copy_attr1[attr1] = df_copy_attr1[attr1].fillna(0).div(attr1_max, axis='columns')

In [52]:
fig, axes = plt.subplots(figsize=(21, 16), nrows=2, ncols=3)

# One panel per normalised attribute: (column, panel title, y-axis label),
# listed in the order the panels fill the grid (row by row)
preference_panels = [
    ('attr1_1', 'Attractiveness', 'Normalized rate of attractiveness'),
    ('sinc1_1', 'Sincerity', 'Normalized rate of sincerity'),
    ('intel1_1', 'Intelligence', 'Normalized rate of Intelligence'),
    ('fun1_1', 'Fun', 'Normalized rate of Fun'),
    ('amb1_1', 'Ambition', 'Normalized rate of Ambition'),
    ('shar1_1', 'Shared Interest', 'Normalized rate of Shared Interest'),
]

# Study of the influence of each stated preference on the decision, split by gender
for panel_ax, (preference, panel_title, rate_label) in zip(axes.flat, preference_panels):
    panel = sns.violinplot(x='dec', y=preference, data=df_copy_attr1, hue='gender', ax=panel_ax)
    panel.set(title=panel_title, xlabel='Decision', ylabel=rate_label)

plt.suptitle("Importance of 'what people look for in the opposite sex' attributes on the decision", fontsize=25)
plt.show()

#### This plot shows that what people look for least in the opposite sex are `attractiveness` and `ambition`. 
#### This differs from what actually drove them to make the `Yes` decision: in that case the `attractiveness` and `ambition` attributes had quite a good score.
#### We can also observe that what people look for most in the opposite sex is `shared interest`, which is really the case.

#### `We can conclude that it is very difficult for people to accurately predict their own perceived value in the dating market.` 

## 6 - Study of the mean decision with respect to the 'speed date' order of the night

In [53]:
fig, axes = plt.subplots(ncols=2, figsize=(12,5))

# In df_copy, 'dec' holds the labels 'yes'/'no'. A mean needs numbers, so the labels
# are mapped explicitly to 1/0: the plotted value is then the share of 'yes'
# decisions, instead of depending on the order in which the plotting library
# happens to meet the two labels. A numeric 'dec' column is used as it is.
dec_numeric = df_copy['dec']
if not pd.api.types.is_numeric_dtype(dec_numeric):
    dec_numeric = dec_numeric.map({'yes': 1, 'no': 0})

order_decisions = df_copy[['order', 'gender']].assign(dec=dec_numeric)

sns.lineplot(x="order", y="dec", hue="gender", data=order_decisions, ax=axes[0]).set(title='Decision mean with respect of \'speed date\' order of the night\n')
sns.lineplot(x="order", y="dec", hue="gender", data=order_decisions, ax=axes[1]).set(title='Same plot zoomed \n')

# Left panel: show the whole no (0) to yes (1) range; the right panel keeps the
# automatic, zoomed-in scale
axes[0].set_yticks([0, 1])
axes[0].set_yticklabels(['no', 'yes'])

plt.show()

#### Generally, we observe that `Yes` decisions decrease during the night. 
#### However, women tend to become pickier than men as the evening progresses.
#### `So, in terms of getting a second date, it is better to be someone's first speed date of the night.`
