# Bivariate Data Analysis

In [None]:
# libraries
import os
import sys
import inspect
# Put the project's source tree on the import path; presumably this is where
# the `shared` package imported below lives - confirm against the repo layout.
sys.path.append("../src")

import numpy as np
import pandas as pd
import warnings
# Silence all warnings for the rest of the session.
warnings.filterwarnings('ignore')

from shared import directories
from shared import filenames
from shared import variables
# Extend the import path with the analysis directory so the project modules
# imported on the next line can be found (presumably they live there).
sys.path.append(directories.ANALYSIS_PATH)

import bivariate, independence

from IPython.core.interactiveshell import InteractiveShell
# Echo the value of every bare expression in a cell, not only the last one;
# the analysis cells below rely on this to show two tables per cell.
InteractiveShell.ast_node_interactivity = "all"

In [None]:
class color:
    """ANSI escape sequences used to style text printed by the notebook.

    Wrap a string as ``color.BOLD + text + color.END``; ``END`` resets
    the terminal attributes set by any of the other codes.
    """

    # Foreground colours.
    PURPLE = '\033[95m'
    CYAN = '\033[96m'
    DARKCYAN = '\033[36m'
    BLUE = '\033[94m'
    GREEN = '\033[92m'
    YELLOW = '\033[93m'
    RED = '\033[91m'

    # Text attributes.
    BOLD = '\033[1m'
    UNDERLINE = '\033[4m'

    # Reset sequence.
    END = '\033[0m'

## Obtain training data

In [None]:
# Locate the training file inside the interim data directory, then load it.
train_path = os.path.join(directories.INTERIM_DATA_DIR, filenames.TRAIN_FILENAME)
# low_memory=False makes pandas parse the file in one pass instead of chunks.
df = pd.read_csv(train_path, encoding="Latin-1", low_memory=False)

### Decision Analysis

#### Decision by Wave Size
The mean wave size for yes decisions was slightly smaller than that for no decisions, though the median wave sizes were the same for both decisions.  The ANOVA test shows a statistically significant difference, although the practical significance is low: the percentage of variance explained was 0.2 percent.

In [None]:
# Decision vs. wave size: run the bivariate analysis and show both tables.
# The notebook echoes the bare names `d` and `i` because
# ast_node_interactivity is set to "all" in the setup cell.
x, y = 'wave_size', 'decision_c'
i, d, p = bivariate.analysis(df, x, y)  # p is not used in this cell
print(color.BOLD + "Decisions by Wave Size" + color.END)
d
# Heading fixed: the accompanying text reports an ANOVA (variance explained),
# not Cramer's association, for this predictor.
print(color.BOLD + "Analysis of Variance of Decision by Wave Size" + color.END)
i

#### Decision by Gender
Based upon the contingency table, a significantly higher number of male participants made positive decisions than female participants. Though the association test shows a weak correlation, I would expect to see a significant result in a chi-squared hypothesis test.

In [None]:
# Decision vs. gender: run the bivariate analysis, then show both result
# tables under their headings (bare names are echoed by the notebook).
x, y = 'gender', 'decision_c'
i, d, p = bivariate.analysis(df, x, y)  # p is not used in this cell
print(color.BOLD, "Contingency Table", color.END, sep="")
d
print(color.BOLD, "Cramer's Association", color.END, sep="")
i

#### Decision by Age
Both the visuals and the analysis of variance show no real relationship between age and decision.

In [None]:
# Decision vs. subject age: analyse, then echo the two tables returned.
x, y = 'age', 'decision_c'
i, d, p = bivariate.analysis(df, x, y)  # p is not used in this cell
print(color.BOLD, "Descriptive Statistics of Decision by Age", color.END, sep="")
d
print(color.BOLD, "Analysis of Variance of Decision by Age", color.END, sep="")
i

#### Decision by Age of Partner
The distributions look nearly identical, suggesting little to no relationship between decision and the age of the partner.  The significance of the ANOVA test could be attributed to the sample size. Eta squared indicates that just 0.2% of the variance in the decision was explained by the age of the partner, so there is little practical significance.

In [None]:
# Decision vs. partner age: analyse, then echo the two tables returned.
x, y = 'age_o', 'decision_c'
i, d, p = bivariate.analysis(df, x, y)  # p is not used in this cell
print(color.BOLD, "Descriptive Statistics of Decision by Age of Partner", color.END, sep="")
d
print(color.BOLD, "Analysis of Variance of Decision by Age of Partner", color.END, sep="")
i

#### Decision by Difference in Age
Again, the distributions look almost identical. There is a slight difference indicated by the boxplots, however the anova test indicates that there was no practical difference in decisions by the difference in age.

In [None]:
# Decision vs. age difference: analyse, then echo the two tables returned.
x, y = 'd_age', 'decision_c'
i, d, p = bivariate.analysis(df, x, y)  # p is not used in this cell
print(color.BOLD, "Descriptive Statistics of Decision by Difference in Age", color.END, sep="")
d
print(color.BOLD, "Analysis of Variance of Decision by Difference in Age", color.END, sep="")
i

#### Decision by Race
Though the analysis of variance test shows a weak association, there was a difference in the proportion of yes decisions across the races.  For instance, the proportion of yes decisions amongst caucasians was significantly higher than that of black participants. This may be a consequence of the numbers of participants by race, especially if same race was a factor. There were more caucasians than the other races combined.

In [None]:
# Decision vs. subject race: analyse, then echo the two tables returned.
# NOTE(review): race looks categorical, yet the second heading says
# "Analysis of Variance" - confirm which test bivariate.analysis runs here.
x, y = 'race', 'decision_c'
i, d, p = bivariate.analysis(df, x, y)  # p is not used in this cell
print(color.BOLD, "Descriptive Statistics of Decision by Race", color.END, sep="")
d
print(color.BOLD, "Analysis of Variance of Decision by Race", color.END, sep="")
i

#### Decision by Race of the Partner
Again we see a weak association between decision and race of the partner, yet stronger than the association between race and decision.

In [None]:
# Decision vs. partner race: analyse, then echo the two tables returned.
# NOTE(review): race_o looks categorical, yet the second heading says
# "Analysis of Variance" - confirm which test bivariate.analysis runs here.
x, y = 'race_o', 'decision_c'
i, d, p = bivariate.analysis(df, x, y)  # p is not used in this cell
print(color.BOLD, "Descriptive Statistics of Decision by Race of Partner", color.END, sep="")
d
print(color.BOLD, "Analysis of Variance of Decision by Race of Partner", color.END, sep="")
i

#### Decision by Same Race Indicator
There appears to be no significant difference between the proportion of yes decisions based upon same race.

In [None]:
# Decision vs. same-race indicator: analyse, then echo the two tables.
x, y = 'samerace_c', 'decision_c'
i, d, p = bivariate.analysis(df, x, y)  # p is not used in this cell
print(color.BOLD, "Descriptive Statistics of Decision by Same Race", color.END, sep="")
d
print(color.BOLD, "Cramer's V Decision by Same Race", color.END, sep="")
i

#### Decisions by Subject Self-Rating
Here we examine the decisions by subjects' ratings of themselves along the following dimensions:
* Attractiveness
* Sincerity
* Intelligence
* Humor
* Ambition

#### Decision by Self-Rating of Attractiveness
Overall, little practical significance in the decisions by self-rating of attractiveness. That said, there was much more variation in the self-ratings among those that made the no decision, than those who chose yes.

In [None]:
# Decision vs. self-rated attractiveness: analyse, then echo both tables.
x, y = 'attractive', 'decision_c'
i, d, p = bivariate.analysis(df, x, y)  # p is not used in this cell
print(color.BOLD, "Descriptive Statistics of Decision by Attractive Self-Rating", color.END, sep="")
d
print(color.BOLD, "Analysis of Variance of Decision by Attractive Self-Rating", color.END, sep="")
i

#### Decision by Self-Rating of Sincere
Distributions nearly identical

In [None]:
# Decision vs. self-rated sincerity: analyse, then echo both tables.
x, y = 'sincere', 'decision_c'
i, d, p = bivariate.analysis(df, x, y)  # p is not used in this cell
print(color.BOLD, "Descriptive Statistics of Decision by Sincerity Self-Rating", color.END, sep="")
d
print(color.BOLD, "Analysis of Variance of Decision by Sincerity Self-Rating", color.END, sep="")
i

#### Decision by Self-Rating of Intelligence
Again, distributions nearly identical

In [None]:
# Decision vs. self-rated intelligence: analyse, then echo both tables.
x, y = 'intelligence', 'decision_c'
i, d, p = bivariate.analysis(df, x, y)  # p is not used in this cell
print(color.BOLD, "Descriptive Statistics of Decision by Intelligence Self-Rating", color.END, sep="")
d
print(color.BOLD, "Analysis of Variance of Decision by Intelligence Self-Rating", color.END, sep="")
i

#### Decision by Self-Rating of Humor
Again, distributions nearly identical

In [None]:
# Decision vs. self-rated humor: analyse, then echo both tables.
x, y = 'funny', 'decision_c'
i, d, p = bivariate.analysis(df, x, y)  # p is not used in this cell
print(color.BOLD, "Descriptive Statistics of Decision by Humor Self-Rating", color.END, sep="")
d
print(color.BOLD, "Analysis of Variance of Decision by Humor Self-Rating", color.END, sep="")
i

#### Decision by Self-Rating of Ambition
Again, distributions nearly identical

In [None]:
# Decision vs. self-rated ambition: analyse, then echo both tables.
x, y = 'ambitious', 'decision_c'
i, d, p = bivariate.analysis(df, x, y)  # p is not used in this cell
print(color.BOLD, "Descriptive Statistics of Decision by Ambition Self-Rating", color.END, sep="")
d
print(color.BOLD, "Analysis of Variance of Decision by Ambition Self-Rating", color.END, sep="")
i

#### Decisions by Rating of Partner
Here we examine the decisions by subjects' ratings of their partner along the following dimensions:
* Attractiveness
* Sincerity
* Intelligence
* Humor
* Ambition
* Shared Interests

#### Decision by Subject Rating of Partner Attractiveness
Significant difference, as expected, in the decisions based upon the attractiveness of partner.  Accounted for 24% of the total variance. 

In [None]:
# Decision vs. rating of partner attractiveness: analyse, echo both tables.
x, y = 'attractive_partner', 'decision_c'
i, d, p = bivariate.analysis(df, x, y)  # p is not used in this cell
print(color.BOLD, "Descriptive Statistics of Decision by Attractiveness of Partner", color.END, sep="")
d
print(color.BOLD, "Analysis of Variance of Decision by Attractiveness of Partner", color.END, sep="")
i

#### Decision by Subject Rating of Partner Sincerity
Significant statistical difference in decision for those found to be sincere, though this accounted for less than 5% of the variance observed.

In [None]:
# Decision vs. rating of partner sincerity: analyse, echo both tables.
x, y = 'sincere_partner', 'decision_c'
i, d, p = bivariate.analysis(df, x, y)  # p is not used in this cell
print(color.BOLD, "Descriptive Statistics of Decision by Sincerity of Partner", color.END, sep="")
d
print(color.BOLD, "Analysis of Variance of Decision by Sincerity of Partner", color.END, sep="")
i

#### Decision by Subject Rating of Partner Intelligence
Again, statistically significant number of yes decisions for those found to be intelligent.

In [None]:
# Decision vs. rating of partner intelligence: analyse, echo both tables.
x, y = 'intelligence_partner', 'decision_c'
i, d, p = bivariate.analysis(df, x, y)  # p is not used in this cell
print(color.BOLD, "Descriptive Statistics of Decision by Intelligence of Partner", color.END, sep="")
d
print(color.BOLD, "Analysis of Variance of Decision by Intelligence of Partner", color.END, sep="")
i

#### Decision by Subject Rating of Partner Humor
Sense of humor is a significant indicator of a decision to pursue a second date. It accounted for 18% of total variance.

In [None]:
# Decision vs. rating of partner humor: analyse, echo both tables.
x, y = 'funny_partner', 'decision_c'
i, d, p = bivariate.analysis(df, x, y)  # p is not used in this cell
print(color.BOLD, "Descriptive Statistics of Decision by Humor of Partner", color.END, sep="")
d
print(color.BOLD, "Analysis of Variance of Decision by Humor of Partner", color.END, sep="")
i

#### Decision by Subject Rating of Partner Ambition
Statistically significant, but not practically significant.

In [None]:
# Decision vs. rating of partner ambition: analyse, echo both tables.
x, y = 'ambitious_partner', 'decision_c'
i, d, p = bivariate.analysis(df, x, y)  # p is not used in this cell
print(color.BOLD, "Descriptive Statistics of Decision by Ambition of Partner", color.END, sep="")
d
print(color.BOLD, "Analysis of Variance of Decision by Ambition of Partner", color.END, sep="")
i

#### Decision by Subject Rating of Partner Shared-Interests
Both statistically and practically significant predictor of decision. Accounted for 17% of variance.

In [None]:
# Decision vs. rating of shared interests: analyse, echo both tables.
x, y = 'shared_interests_partner', 'decision_c'
i, d, p = bivariate.analysis(df, x, y)  # p is not used in this cell
print(color.BOLD, "Descriptive Statistics of Decision by Partner Shared Interests", color.END, sep="")
d
print(color.BOLD, "Analysis of Variance of Decision by Partner Shared Interests", color.END, sep="")
i

#### Decision by Whether the Couple has Met Before
Whether the couple has met before shows a very weak association with the decision.

In [None]:
# Decision vs. whether the couple had met before: run the bivariate analysis
# and show both tables. The notebook echoes the bare names `d` and `i`
# because ast_node_interactivity is set to "all" in the setup cell.
x, y = 'met_c', 'decision_c'
i, d, p = bivariate.analysis(df, x, y)  # p is not used in this cell
# Headings fixed: they previously repeated the "Partner Shared Interests"
# titles from the preceding cell and mislabelled this output.
print(color.BOLD + "Descriptive Statistics of Decision by Met Before" + color.END)
d
# NOTE(review): labelled Cramer's V to match the other `_c` indicator cell
# (samerace_c) and the "association" wording of the text - confirm that
# bivariate.analysis treats met_c as categorical.
print(color.BOLD + "Cramer's V Decision by Met Before" + color.END)
i

#### Decision by Relative Attractiveness
This shows the decision by the subjects' perception of their attractiveness vis-a-vis their perception of their partner's attractiveness. This analysis shows a significant difference in decisions based upon the subject's perception of their attractiveness, relative to their perceptions of their partner's attractiveness.  In short, the mean and median difference was positive for yes decisions and negative for no decisions. This implies that subjects primarily preferred partners that they perceived to be equal to or greater than their self-perception of attractiveness.  The amount of variance explained is 14.5%.

In [None]:
# Decision vs. relative attractiveness: analyse, then echo both tables.
x, y = 'rd_attractive', 'decision_c'
i, d, p = bivariate.analysis(df, x, y)  # p is not used in this cell
print(color.BOLD, "Descriptive Statistics of Decision by Relative Attractiveness", color.END, sep="")
d
print(color.BOLD, "Analysis of Variance of Decision by Relative Attractiveness", color.END, sep="")
i

#### Decision by Relative Sincerity
This shows the decision by the subjects' perception of their sincerity vis-a-vis their perception of their partner's sincerity. This analysis shows a significant difference in decisions based upon the subject's perception of their sincerity, relative to their perceptions of their partner's sincerity. In this case, the mean and median differences were negative, although the differences were more pronounced for the no decisions. This would suggest a degree of tolerance for a lower level of sincerity.

In [None]:
# Decision vs. relative sincerity: analyse, then echo both tables.
x, y = 'rd_sincere', 'decision_c'
i, d, p = bivariate.analysis(df, x, y)  # p is not used in this cell
print(color.BOLD, "Descriptive Statistics of Decision by Relative Sincerity", color.END, sep="")
d
print(color.BOLD, "Analysis of Variance of Decision by Relative Sincerity", color.END, sep="")
i

#### Decision by Relative Intelligence
This shows the decision by the subjects' perception of their intelligence vis-a-vis their perception of their partner's intelligence. This analysis shows a significant difference in decisions based upon the subject's perception of their intelligence, relative to their perceptions of their partner's intelligence. The distribution of differences was similar to that for attractiveness.  The mean and median for the 'yes' decision were non-negative, whereas the mean and median for the no decision were negative. But in this case, the practical significance was low with a percent variance explained of 2%.

In [None]:
# Decision vs. relative intelligence: analyse, then echo both tables.
x, y = 'rd_intelligence', 'decision_c'
i, d, p = bivariate.analysis(df, x, y)  # p is not used in this cell
print(color.BOLD, "Descriptive Statistics of Decision by Relative Intelligence", color.END, sep="")
d
print(color.BOLD, "Analysis of Variance of Decision by Relative Intelligence", color.END, sep="")
i

#### Decision by Relative Sense of Humor
This shows the decision by the subjects' perception of their Sense of Humor vis-a-vis their perception of their partner's Sense of Humor. This analysis shows a significant difference in decisions based upon the subject's perception of their Sense of Humor, relative to their perceptions of their partner's Sense of Humor. The means and medians were negative for both yes and no decisions, indicating that most participants considered their partners 'funnier' than themselves. Yes decisions were associated with slight differences, whereas the no decisions were associated with subjects' perceptions of their partner being 'too funny'. The difference was both statistically and practically significant.  The variance explained by the subject's perception of the difference in sense of humor was 14%.

In [None]:
# Decision vs. relative sense of humor: analyse, then echo both tables.
x, y = 'rd_funny', 'decision_c'
i, d, p = bivariate.analysis(df, x, y)  # p is not used in this cell
print(color.BOLD, "Descriptive Statistics of Decision by Relative Sense of Humor", color.END, sep="")
d
print(color.BOLD, "Analysis of Variance of Decision by Relative Sense of Humor", color.END, sep="")
i

#### Decision by Relative Ambition
This shows the decision by the subjects' perception of their Ambition vis-a-vis their perception of their partner's Ambition. This analysis shows a significant difference in decisions based upon the subject's perception of their Ambition, relative to their perceptions of their partner's Ambition. The mean and median difference was non-negative for the yes decisions, suggesting that most of the yes decisions were for partners with equal or greater ambition. The practical significance is indicated by 1% variance explained.

In [None]:
# Decision vs. relative ambition: analyse, then echo both tables.
x, y = 'rd_ambitious', 'decision_c'
i, d, p = bivariate.analysis(df, x, y)  # p is not used in this cell
print(color.BOLD, "Descriptive Statistics of Decision by Relative Ambition", color.END, sep="")
d
print(color.BOLD, "Analysis of Variance of Decision by Relative Ambition", color.END, sep="")
i

### Decision Summary 

#### Categorical Predictors

In [None]:
# Summary table for the categorical predictors of decision_c.
# NOTE(review): threshold presumably filters rows by association strength -
# confirm against independence.assoc_table.
independence.assoc_table(
    df,
    y='decision_c',
    threshold=0.1,
)

#### Quantitative Predictors

In [None]:
# Summary table for the quantitative predictors of decision_c.
# NOTE(review): threshold presumably filters rows by effect size -
# confirm against independence.aov_table.
independence.aov_table(
    df,
    y='decision_c',
    threshold=0.03,
)