In [1]:
import numpy as np
from scipy import stats
import statsmodels.api as sm
from statsmodels.tsa.stattools import adfuller
import pandas as pd

In [2]:
def warn(*args, **kwargs):
     pass
import warnings
warnings.warn = warn
warnings.filterwarnings('ignore')

In [3]:
ownership = pd.read_csv("../../data/final_data/final_ownership_data.csv")
sales = pd.read_csv("../../data/final_data/final_sales_data.csv")

In [4]:
ownership.columns

Index(['Unnamed: 0', '_c0', 'Occupation', 'Annual_Income', 'Credit_Score',
       'Years_of_Employment', 'Finance_Status', 'Car', 'Number_of_Children'],
      dtype='object')

In [5]:
sales.columns

Index(['Unnamed: 0', '_c0', 'Car_id', 'Date', 'Gender', 'Annual_Income',
       'Dealer_Name', 'Company', 'Model', 'Engine', 'Transmission', 'Color',
       'Price', 'Dealer_No ', 'Body_Style', 'Phone', 'Dealer_Region', 'Month',
       'Year'],
      dtype='object')

In [6]:
def t_test(df, column1, column2):
     """
     t-test
     """
     # Separate years of employment for car owners and non-owners
     car_owners = df[df[column1] == True][column2]
     non_car_owners = df[df[column1] == False][column2]

     # Perform independent t-test
     t_stat, p_value = stats.ttest_ind(car_owners, non_car_owners)

     print(f"T-statistic: {t_stat:.4f}")
     print(f"P-value: {p_value:.4f}")

     return t_stat, p_value


def chi_square_test(df, column1, column2):
     """
     Performs chi-square test of independence between two categorical variables
     """
     # Create contingency table
     contingency = pd.crosstab(df[column1], df[column2])

     # Perform chi-square test
     chi2_stat, p_value, dof, expected = stats.chi2_contingency(contingency)

     print(f"Contingency Table:\n{contingency}\n")
     print(f"Chi-square statistic: {chi2_stat:.4f}")
     print(f"p-value: {p_value:.4f}")
     print(f"Degrees of freedom: {dof}")
     return chi2_stat, p_value, dof, expected

def two_way_anova(df, dependent_var, factor1, factor2):
     """
     Performs two-way ANOVA test
     """
     # Create groups based on factors
     groups = []
     for f1 in df[factor1].unique():
          for f2 in df[factor2].unique():
               group = df[(df[factor1] == f1) & (df[factor2] == f2)][dependent_var]
               groups.append(group)

     # Perform ANOVA
     f_stat, p_value = stats.f_oneway(*groups)

     print(f"F-statistic: {f_stat:.4f}")
     print(f"p-value: {p_value:.4f}")
     return f_stat, p_value

def mann_whitney_test(df, numeric_column, binary_column):
     """
     Performs Mann-Whitney U test between two groups
     """
     group1 = df[df[binary_column] == True][numeric_column]
     group2 = df[df[binary_column] == False][numeric_column]

     # Perform test
     stat, p_value = stats.mannwhitneyu(group1, group2, alternative='two-sided')

     print(f"Mann-Whitney U statistic: {stat:.4f}")
     print(f"p-value: {p_value:.4f}")
     return stat, p_value

def multiple_linear_regression(df, dependent_var, independent_vars):
     """
     Performs multiple linear regression
     """
     # Prepare the data
     X = df[independent_vars]
     # Add constant for intercept
     X = sm.add_constant(X)
     y = df[dependent_var]

     # Fit the model
     model = sm.OLS(y, X).fit()

     print(model.summary())
     return model

def time_series_analysis(df, date_column, value_column):
     """
     Performs Augmented Dickey-Fuller test for stationarity
     """
     # Ensure data is sorted by date
     df = df.sort_values(date_column)

     # Perform ADF test
     result = adfuller(df[value_column].values)

     print('ADF Statistic:', result[0])
     print('p-value:', result[1])
     print('Critical values:')
     for key, value in result[4].items():
          print('\t%s: %.3f' % (key, value))
     return result


In [7]:
chi_square_test(ownership, 'Occupation', 'Car')

Contingency Table:
Car                      No  Yes
Occupation                      
Account Executive         0    3
Account Manager           0    2
Accountant                1    8
Architect                 1   10
Art Director              0    1
...                      ..  ...
Veterinarian Technician   1    0
Waiter/Waitress           1    0
Web Designer              4    3
Web Developer             1    6
Writer                    7    0

[116 rows x 2 columns]

Chi-square statistic: 273.4004
p-value: 0.0000
Degrees of freedom: 115


(273.4004148727985,
 6.337285346434918e-15,
 115,
 array([[ 1.10606061,  1.89393939],
        [ 0.73737374,  1.26262626],
        [ 3.31818182,  5.68181818],
        [ 4.05555556,  6.94444444],
        [ 0.36868687,  0.63131313],
        [ 0.73737374,  1.26262626],
        [ 0.36868687,  0.63131313],
        [ 1.10606061,  1.89393939],
        [ 0.36868687,  0.63131313],
        [ 0.36868687,  0.63131313],
        [ 1.10606061,  1.89393939],
        [ 0.36868687,  0.63131313],
        [ 0.73737374,  1.26262626],
        [ 1.47474747,  2.52525253],
        [ 8.84848485, 15.15151515],
        [ 0.36868687,  0.63131313],
        [ 0.36868687,  0.63131313],
        [ 0.36868687,  0.63131313],
        [ 1.47474747,  2.52525253],
        [ 0.36868687,  0.63131313],
        [ 0.73737374,  1.26262626],
        [ 2.21212121,  3.78787879],
        [ 0.36868687,  0.63131313],
        [ 1.10606061,  1.89393939],
        [ 1.84343434,  3.15656566],
        [ 1.10606061,  1.89393939],
        [ 2.58

In [8]:
chi_square_test(ownership, 'Finance_Status', 'Car')

Contingency Table:
Car             No  Yes
Finance_Status         
Excellent        3    8
Fair            10    4
Good             3   10
Poor             4    0
Stable          49  220
Unknown         10    3
Unstable        67    5

Chi-square statistic: 162.3593
p-value: 0.0000
Degrees of freedom: 6


(162.35931369073967,
 1.8736391064028802e-32,
 6,
 array([[  4.05555556,   6.94444444],
        [  5.16161616,   8.83838384],
        [  4.79292929,   8.20707071],
        [  1.47474747,   2.52525253],
        [ 99.17676768, 169.82323232],
        [  4.79292929,   8.20707071],
        [ 26.54545455,  45.45454545]]))

In [9]:
ownership_tt = ownership.__deepcopy__()
ownership_tt['Car']= ownership_tt['Car'].replace({'Yes': 1, 'No': 0}).astype(int)

In [10]:
t_test(ownership_tt, 'Car', 'Years_of_Employment')

T-statistic: 16.1754
P-value: 0.0000


(16.175410175636937, 1.70182323635391e-45)

In [11]:
chi_square_test(ownership, 'Car', 'Number_of_Children')

Contingency Table:
Number_of_Children  -1   0   1   2   3   4
Car                                       
No                  42  38  32  25   8   1
Yes                 37  74  68  62   9   0

Chi-square statistic: 15.3907
p-value: 0.0088
Degrees of freedom: 5


(15.390747852416434,
 0.008816920421520028,
 5,
 array([[29.12626263, 41.29292929, 36.86868687, 32.07575758,  6.26767677,
          0.36868687],
        [49.87373737, 70.70707071, 63.13131313, 54.92424242, 10.73232323,
          0.63131313]]))

In [12]:
mann_whitney_test(ownership_tt, 'Annual_Income', 'Car')

Mann-Whitney U statistic: 32367.5000
p-value: 0.0000


(32367.5, 8.075407838854787e-38)

In [13]:
# multiple_linear_regression(sales, 'Gender', ['Annual_Income', 'Price']) # WIP

ValueError: Pandas data cast to numpy dtype of object. Check input data with np.asarray(data).