In [30]:
import numpy as np
from scipy import stats
import statsmodels.api as sm
from statsmodels.tsa.stattools import adfuller
from statsmodels.stats.multicomp import pairwise_tukeyhsd
from statsmodels.formula.api import logit
import pandas as pd

In [31]:
def warn(*args, **kwargs):
     pass
import warnings
warnings.warn = warn
warnings.filterwarnings('ignore')

In [32]:
ownership = pd.read_csv("../../data/final_data/final_ownership_data.csv")
sales = pd.read_csv("../../data/final_data/final_sales_data.csv")

In [33]:
ownership.columns

Index(['Unnamed: 0', '_c0', 'Occupation', 'Annual_Income', 'Credit_Score',
       'Years_of_Employment', 'Finance_Status', 'Car', 'Number_of_Children'],
      dtype='object')

In [34]:
sales.columns

Index(['Unnamed: 0', '_c0', 'Car_id', 'Date', 'Gender', 'Annual_Income',
       'Dealer_Name', 'Company', 'Model', 'Engine', 'Transmission', 'Color',
       'Price', 'Dealer_No ', 'Body_Style', 'Phone', 'Dealer_Region', 'Month',
       'Year'],
      dtype='object')

In [35]:
def t_test(df, column1, column2):
     """
     t-test
     """
     # Separate years of employment for car owners and non-owners
     car_owners = df[df[column1] == True][column2]
     non_car_owners = df[df[column1] == False][column2]

     # Perform independent t-test
     t_stat, p_value = stats.ttest_ind(car_owners, non_car_owners)

     print(f"T-statistic: {t_stat:.4f}")
     print(f"P-value: {p_value:.4f}")

     return t_stat, p_value


def chi_square_test(df, column1, column2):
     """
     Performs chi-square test of independence between two categorical variables
     """
     # Create contingency table
     contingency = pd.crosstab(df[column1], df[column2])

     # Perform chi-square test
     chi2_stat, p_value, dof, expected = stats.chi2_contingency(contingency)

     print(f"Contingency Table:\n{contingency}\n")
     print(f"Chi-square statistic: {chi2_stat:.4f}")
     print(f"p-value: {p_value:.4f}")
     print(f"Degrees of freedom: {dof}")
     return chi2_stat, p_value, dof, expected

def one_way_anova(df, numerical_column, categorical_column):
     """
     Analyzes if there are significant differences in numerical_column across categorical_column using one-way ANOVA
     """

     dealer_groups = [group[numerical_column].values for name, group in df.groupby(categorical_column)]

     f_stat, p_value = stats.f_oneway(*dealer_groups)

     dealer_stats = df.groupby(categorical_column)[numerical_column].agg([
          ('mean', 'mean'),
          ('std', 'std'),
          ('count', 'count')
     ]).round(2)

     dealer_stats = dealer_stats.sort_values('mean', ascending=False)

     print("\nDealer Performance Analysis (One-way ANOVA)")
     print("-" * 50)
     print(f"\nDescriptive Statistics by {categorical_column}:")
     print(dealer_stats)
     print("\nANOVA Test Results:")
     print(f"F-statistic: {f_stat:.4f}")
     print(f"p-value: {p_value:.4f}")

     if p_value < 0.05:
          print("\nSince there are significant differences, performing Tukey's HSD test...")

          df_for_tukey = pd.DataFrame({
               numerical_column: df[numerical_column],
               categorical_column: df[categorical_column]
          })

          tukey = pairwise_tukeyhsd(df_for_tukey[numerical_column], df_for_tukey[categorical_column])

          print("\nTukey's HSD Test Results:")
          print(tukey)

     return f_stat, p_value, dealer_stats

def mann_whitney_test(df, numeric_column, binary_column):
     """
     Performs Mann-Whitney U test between two groups
     """
     group1 = df[df[binary_column] == True][numeric_column]
     group2 = df[df[binary_column] == False][numeric_column]

     # Perform test
     stat, p_value = stats.mannwhitneyu(group1, group2, alternative='two-sided')

     print(f"Mann-Whitney U statistic: {stat:.4f}")
     print(f"p-value: {p_value:.4f}")
     return stat, p_value

def multiple_linear_regression(df, dependent_var, independent_vars):
     """
     Performs multiple linear regression
     """
     # Prepare the data
     X = df[independent_vars]
     # Add constant for intercept
     X = sm.add_constant(X)
     y = df[dependent_var]

     # Fit the model
     model = sm.OLS(y, X).fit()

     print(model.summary())
     return model

def time_series_analysis(df, date_column, value_column):
     """
     Performs Augmented Dickey-Fuller test for stationarity
     """
     # Ensure data is sorted by date
     df = df.sort_values(date_column)

     # Perform ADF test
     result = adfuller(df[value_column].values)

     print('ADF Statistic:', result[0])
     print('p-value:', result[1])
     print('Critical values:')
     for key, value in result[4].items():
          print('\t%s: %.3f' % (key, value))
     return result


In [36]:
chi_square_test(ownership, 'Occupation', 'Car')

Contingency Table:
Car                      No  Yes
Occupation                      
Account Executive         0    3
Account Manager           0    2
Accountant                1    8
Architect                 1   10
Art Director              0    1
...                      ..  ...
Veterinarian Technician   1    0
Waiter/Waitress           1    0
Web Designer              4    3
Web Developer             1    6
Writer                    7    0

[116 rows x 2 columns]

Chi-square statistic: 273.4004
p-value: 0.0000
Degrees of freedom: 115


(273.4004148727985,
 6.337285346434918e-15,
 115,
 array([[ 1.10606061,  1.89393939],
        [ 0.73737374,  1.26262626],
        [ 3.31818182,  5.68181818],
        [ 4.05555556,  6.94444444],
        [ 0.36868687,  0.63131313],
        [ 0.73737374,  1.26262626],
        [ 0.36868687,  0.63131313],
        [ 1.10606061,  1.89393939],
        [ 0.36868687,  0.63131313],
        [ 0.36868687,  0.63131313],
        [ 1.10606061,  1.89393939],
        [ 0.36868687,  0.63131313],
        [ 0.73737374,  1.26262626],
        [ 1.47474747,  2.52525253],
        [ 8.84848485, 15.15151515],
        [ 0.36868687,  0.63131313],
        [ 0.36868687,  0.63131313],
        [ 0.36868687,  0.63131313],
        [ 1.47474747,  2.52525253],
        [ 0.36868687,  0.63131313],
        [ 0.73737374,  1.26262626],
        [ 2.21212121,  3.78787879],
        [ 0.36868687,  0.63131313],
        [ 1.10606061,  1.89393939],
        [ 1.84343434,  3.15656566],
        [ 1.10606061,  1.89393939],
        [ 2.58

In [37]:
chi_square_test(ownership, 'Finance_Status', 'Car')

Contingency Table:
Car             No  Yes
Finance_Status         
Excellent        3    8
Fair            10    4
Good             3   10
Poor             4    0
Stable          49  220
Unknown         10    3
Unstable        67    5

Chi-square statistic: 162.3593
p-value: 0.0000
Degrees of freedom: 6


(162.35931369073967,
 1.8736391064028802e-32,
 6,
 array([[  4.05555556,   6.94444444],
        [  5.16161616,   8.83838384],
        [  4.79292929,   8.20707071],
        [  1.47474747,   2.52525253],
        [ 99.17676768, 169.82323232],
        [  4.79292929,   8.20707071],
        [ 26.54545455,  45.45454545]]))

In [38]:
ownership_tt = ownership.__deepcopy__()
ownership_tt['Car']= ownership_tt['Car'].replace({'Yes': 1, 'No': 0}).astype(int)

In [39]:
t_test(ownership_tt, 'Car', 'Years_of_Employment')

T-statistic: 16.1754
P-value: 0.0000


(16.175410175636937, 1.70182323635391e-45)

In [40]:
chi_square_test(ownership, 'Car', 'Number_of_Children')

Contingency Table:
Number_of_Children  -1   0   1   2   3   4
Car                                       
No                  42  38  32  25   8   1
Yes                 37  74  68  62   9   0

Chi-square statistic: 15.3907
p-value: 0.0088
Degrees of freedom: 5


(15.390747852416434,
 0.008816920421520028,
 5,
 array([[29.12626263, 41.29292929, 36.86868687, 32.07575758,  6.26767677,
          0.36868687],
        [49.87373737, 70.70707071, 63.13131313, 54.92424242, 10.73232323,
          0.63131313]]))

In [41]:
mann_whitney_test(ownership_tt, 'Annual_Income', 'Car')

Mann-Whitney U statistic: 32367.5000
p-value: 0.0000


(32367.5, 8.075407838854787e-38)

In [42]:
sales_mlr = sales.__deepcopy__()
sales_mlr['Gender'] = sales_mlr['Gender'].replace({'Male': 1, 'Female': 0}).astype(int)

In [43]:
multiple_linear_regression(sales_mlr, 'Gender', ['Annual_Income', 'Price'])

                            OLS Regression Results                            
Dep. Variable:                 Gender   R-squared:                       0.003
Model:                            OLS   Adj. R-squared:                  0.003
Method:                 Least Squares   F-statistic:                     35.85
Date:                Fri, 29 Nov 2024   Prob (F-statistic):           2.84e-16
Time:                        13:23:36   Log-Likelihood:                -12565.
No. Observations:               23906   AIC:                         2.514e+04
Df Residuals:                   23903   BIC:                         2.516e+04
Df Model:                           2                                         
Covariance Type:            nonrobust                                         
                    coef    std err          t      P>|t|      [0.025      0.975]
---------------------------------------------------------------------------------
const             0.7663      0.006    119.298

<statsmodels.regression.linear_model.RegressionResultsWrapper at 0x2c200c96c90>

In [44]:
chi_square_test(sales, 'Gender', 'Dealer_Name')

Contingency Table:
Dealer_Name  Buddy Storbeck's Diesel Service Inc  C & M Motors Inc  \
Gender                                                               
Female                                       118               133   
Male                                         509               492   

Dealer_Name  Capitol KIA  Chrysler Plymouth  Chrysler of Tri-Cities  \
Gender                                                                
Female               140                151                     136   
Male                 488                474                     490   

Dealer_Name  Classic Chevy  Clay Johnson Auto Sales  Diehl Motor CO Inc  \
Gender                                                                    
Female                 129                      136                 138   
Male                   494                      491                 486   

Dealer_Name  Enterprise Rent A Car  Gartner Buick Hyundai Saab  ...  \
Gender                                     

(29.424176225554834,
 0.3406370281512807,
 27,
 array([[ 133.97122061,  133.5438802 ,  134.18489082,  133.5438802 ,
          133.75755041,  133.11653978,  133.97122061,  133.33020999,
          133.5438802 ,  134.18489082,  134.39856103,  133.97122061,
          134.39856103,  133.75755041,  135.25324186,  134.39856103,
          134.61223124,  134.18489082,  281.61733456,  280.54898352,
          267.72877102,  266.66041998,  267.3014306 ,  266.23307956,
          266.87409019,  265.59206894,  266.87409019,  266.44674977],
        [ 493.02877939,  491.4561198 ,  493.81510918,  491.4561198 ,
          492.24244959,  489.88346022,  493.02877939,  490.66979001,
          491.4561198 ,  493.81510918,  494.60143897,  493.02877939,
          494.60143897,  492.24244959,  497.74675814,  494.60143897,
          495.38776876,  493.81510918, 1036.38266544, 1032.45101648,
          985.27122898,  981.33958002,  983.6985694 ,  979.76692044,
          982.12590981,  977.40793106,  982.12590981,  

In [45]:
time_series_analysis(sales, 'Date', 'Price')

ADF Statistic: -108.8116803559421
p-value: 0.0
Critical values:
	1%: -3.431
	5%: -2.862
	10%: -2.567


(-108.8116803559421,
 0.0,
 1,
 23904,
 {'1%': -3.4306235944764296,
  '5%': -2.8616609202307246,
  '10%': -2.566834362345704},
 525842.0397965112)

In [47]:
one_way_anova(sales, 'Price', 'Dealer_Name')


Dealer Performance Analysis (One-way ANOVA)
--------------------------------------------------

Descriptive Statistics by Dealer_Name:
                                                     mean       std  count
Dealer_Name                                                               
U-Haul CO                                        28769.92  15187.62   1247
Classic Chevy                                    28602.01  15310.96    623
Rabun Used Car Sales                             28527.54  15344.50   1313
Iceberg Rentals                                  28522.96  15016.38    627
Enterprise Rent A Car                            28312.58  14983.85    625
Scrivener Performance Engineering                28297.37  15342.57   1246
Gartner Buick Hyundai Saab                       28247.62  14948.50    628
Saab-Belle Dodge                                 28190.14  14720.69   1251
Capitol KIA                                      28189.70  15037.23    628
Race Car Help                          

(0.4343163037926954,
 0.995264526609837,
                                                      mean       std  count
 Dealer_Name                                                               
 U-Haul CO                                        28769.92  15187.62   1247
 Classic Chevy                                    28602.01  15310.96    623
 Rabun Used Car Sales                             28527.54  15344.50   1313
 Iceberg Rentals                                  28522.96  15016.38    627
 Enterprise Rent A Car                            28312.58  14983.85    625
 Scrivener Performance Engineering                28297.37  15342.57   1246
 Gartner Buick Hyundai Saab                       28247.62  14948.50    628
 Saab-Belle Dodge                                 28190.14  14720.69   1251
 Capitol KIA                                      28189.70  15037.23    628
 Race Car Help                                    28163.37  15102.15   1253
 Chrysler of Tri-Cities                        

In [48]:
chi_square_test(sales, 'Gender', 'Body_Style')

Contingency Table:
Body_Style  Hardtop  Hatchback  Passenger   SUV  Sedan
Gender                                                
Female          647       1298        883  1335    945
Male           2324       4830       3062  5039   3543

Chi-square statistic: 3.7811
p-value: 0.4364
Degrees of freedom: 4


(3.7811059955061026,
 0.43643965301514065,
 4,
 array([[ 634.81418891, 1309.37103656,  842.92897181, 1361.93390781,
          958.95189492],
        [2336.18581109, 4818.62896344, 3102.07102819, 5012.06609219,
         3529.04810508]]))

In [49]:
one_way_anova(sales, 'Price', 'Dealer_Region')


Dealer Performance Analysis (One-way ANOVA)
--------------------------------------------------

Descriptive Statistics by Dealer_Region:
                    mean       std  count
Dealer_Region                            
Texas           28341.60  14903.88   4135
Colorado        28334.63  15026.21   3130
South Carolina  28180.82  15101.54   3128
Washington      28119.04  14659.32   3131
Arizona         27954.96  14902.92   3433
Connecticut     27856.34  14619.84   3128
Wisconsin       27833.35  14345.00   3821

ANOVA Test Results:
F-statistic: 0.7335
p-value: 0.6226


(0.7334944054339311,
 0.6225891310153794,
                     mean       std  count
 Dealer_Region                            
 Texas           28341.60  14903.88   4135
 Colorado        28334.63  15026.21   3130
 South Carolina  28180.82  15101.54   3128
 Washington      28119.04  14659.32   3131
 Arizona         27954.96  14902.92   3433
 Connecticut     27856.34  14619.84   3128
 Wisconsin       27833.35  14345.00   3821)