In [1]:
# Notebook-wide imports: data handling, plotting, statistics and modelling.
import pandas as pd
import numpy as np
import os
# `random` is bound to numpy.random here; the stdlib `random` module itself is
# never bound below (only `choice` is imported from it).
from numpy import random
import scipy 
from scipy import stats
from matplotlib import pyplot as plt
from scipy.stats import binom
from scipy.stats import poisson
from scipy.stats import norm ,t
import seaborn as sns
import pylab
from pylab import legend,plot,show,title,xlabel,ylabel
from random import choice
import statsmodels
# NOTE(review): this rebinds the name `stats`, so from here on `stats` refers to
# statsmodels.stats and no longer to scipy.stats imported above.
from statsmodels import stats
from statsmodels.stats import weightstats as ssw
import statsmodels.api as sm
from statsmodels.formula.api import ols
import statsmodels.stats.multicomp
from statsmodels.stats.multicomp import pairwise_tukeyhsd
from statsmodels.stats import proportion as ssp
from scipy.stats import chisquare
from scipy.stats import chi2_contingency


# Used further down to hold out a test set before fitting the regressions.
from sklearn.model_selection import train_test_split

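- Number of dummy columns = number of levels - 1 (a categorical variable with k levels is encoded using k - 1 dummy columns).
- Reference event -> the level whose dummy column is removed above; every other level is compared against it.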

In [3]:
# Read the 'salaries' worksheet of the course workbook into a DataFrame.
df_sal = pd.read_excel(io='CDAC_Databook.xlsx', sheet_name='salaries')

In [4]:
df_sal.head()

Unnamed: 0,rank,discipline,yrs_phd,yrs_service,gender,salary
0,Prof,B,19,18,Male,139750
1,Prof,B,20,16,Male,173200
2,AsstProf,B,4,3,Male,79750
3,Prof,B,45,39,Male,115000
4,Prof,B,40,41,Male,141500


In [5]:
# Keep only the categorical predictor, the numeric predictor and the response.
keep_cols = ['gender', 'yrs_service', 'salary']
df_sal = df_sal[keep_cols]

In [6]:
df_sal.head()

Unnamed: 0,gender,yrs_service,salary
0,Male,18,139750
1,Male,16,173200
2,Male,3,79750
3,Male,39,115000
4,Male,41,141500


In [7]:
df_sal.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 397 entries, 0 to 396
Data columns (total 3 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   gender       397 non-null    object
 1   yrs_service  397 non-null    int64 
 2   salary       397 non-null    int64 
dtypes: int64(2), object(1)
memory usage: 9.4+ KB


In [8]:
df_sal.gender.value_counts()

gender
Male      358
Female     39
Name: count, dtype: int64

In [9]:
# Step 1: encode `gender` as integer 0/1 dummy columns.
# drop_first=True omits the first level, leaving k - 1 columns.
gen_dum = pd.get_dummies(df_sal['gender'], drop_first=True, dtype=int)

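- ```drop_first=True``` --> the first level in alphabetical order is dropped and is taken as the reference event (here `Female`, so only the `Male` column is kept)
- This is commonly called **"One Hot Encoding"**; strictly speaking, keeping only k - 1 columns is *dummy encoding*, whereas one-hot encoding keeps all k columns.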

In [11]:
# Step 2: the text column is no longer needed once its dummies exist.
df_sal = df_sal.drop(columns='gender')

In [12]:
# Step 3: place the dummy column(s) side by side with the remaining columns
# (column-wise concatenation, aligned on the row index).
df_sal = pd.concat([df_sal, gen_dum], axis='columns')

In [13]:
df_sal

Unnamed: 0,yrs_service,salary,Male
0,18,139750,1
1,16,173200,1
2,3,79750,1
3,39,115000,1
4,41,141500,1
...,...,...,...
392,30,103106,1
393,19,150564,1
394,25,101738,1
395,15,95329,1


In [14]:
# Step 4: hold out 20% of the rows as a test set.
# The fixed random_state makes the split repeatable.
features = df_sal.drop(columns='salary')
target = df_sal['salary']
x_train, x_test, y_train, y_test = train_test_split(
    features,
    target,
    test_size=0.2,
    random_state=20,
)

In [15]:
# Step 5: give the training design matrix an intercept column.
# prepend=False appends it as the last column; has_constant='skip' is the
# library default, spelled out here for clarity.
x_train = sm.add_constant(x_train, prepend=False, has_constant='skip')

In [16]:
# Step 6: fit ordinary least squares with salary as the response (endog)
# and the training design matrix as the predictors (exog).
mod2 = sm.OLS(endog=y_train, exog=x_train).fit()

In [17]:
# 7. Print the summary
print(mod2.summary())

                            OLS Regression Results                            
Dep. Variable:                 salary   R-squared:                       0.121
Model:                            OLS   Adj. R-squared:                  0.115
Method:                 Least Squares   F-statistic:                     21.52
Date:                Thu, 19 Dec 2024   Prob (F-statistic):           1.75e-09
Time:                        09:22:50   Log-Likelihood:                -3698.4
No. Observations:                 317   AIC:                             7403.
Df Residuals:                     314   BIC:                             7414.
Df Model:                           2                                         
Covariance Type:            nonrobust                                         
                  coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------------------------------------------------------
yrs_service   701.6382    125.946      5.571      

- The level that has been taken as the reference event does not appear in the output table above (here: Female).
- Male ---> Target event
- **Ho (Gender)**: The change in response when the factor/variable changes from the reference event to the target event is equal to 0.
- In this example, Ho states that the change in salary when gender changes from Female to Male is 0.

In [19]:
df_sal

Unnamed: 0,yrs_service,salary,Male
0,18,139750,1
1,16,173200,1
2,3,79750,1
3,39,115000,1
4,41,141500,1
...,...,...,...
392,30,103106,1
393,19,150564,1
394,25,101738,1
395,15,95329,1


In [47]:
# Reload the untouched 'salaries' worksheet so the next example starts
# from the full set of columns again.
df_sal = pd.read_excel(io='CDAC_Databook.xlsx', sheet_name='salaries')

In [49]:
df_sal.head()

Unnamed: 0,rank,discipline,yrs_phd,yrs_service,gender,salary
0,Prof,B,19,18,Male,139750
1,Prof,B,20,16,Male,173200
2,AsstProf,B,4,3,Male,79750
3,Prof,B,45,39,Male,115000
4,Prof,B,40,41,Male,141500


In [51]:
# Keep only the categorical predictor, the numeric predictor and the response.
keep_cols = ['rank', 'yrs_service', 'salary']
df_sal = df_sal[keep_cols]

In [53]:
# Step 1: encode `rank` as integer 0/1 dummy columns.
# drop_first=True omits the first level, leaving k - 1 columns.
rank_dum = pd.get_dummies(df_sal['rank'], drop_first=True, dtype=int)

In [55]:
# Step 2: the text column is no longer needed once its dummies exist.
df_sal = df_sal.drop(columns='rank')

In [57]:
# Step 3: place the dummy column(s) side by side with the remaining columns
# (column-wise concatenation, aligned on the row index).
df_sal = pd.concat([df_sal, rank_dum], axis='columns')

In [59]:
df_sal

Unnamed: 0,yrs_service,salary,AsstProf,Prof
0,18,139750,0,1
1,16,173200,0,1
2,3,79750,1,0
3,39,115000,0,1
4,41,141500,0,1
...,...,...,...,...
392,30,103106,0,1
393,19,150564,0,1
394,25,101738,0,1
395,15,95329,0,1


In [61]:
# Step 4: hold out 20% of the rows as a test set.
# The fixed random_state makes the split repeatable.
features = df_sal.drop(columns='salary')
target = df_sal['salary']
x_train, x_test, y_train, y_test = train_test_split(
    features,
    target,
    test_size=0.2,
    random_state=20,
)

In [63]:
# Step 5: give the training design matrix an intercept column.
# prepend=False appends it as the last column; has_constant='skip' is the
# library default, spelled out here for clarity.
x_train = sm.add_constant(x_train, prepend=False, has_constant='skip')

In [65]:
x_train.head()

Unnamed: 0,yrs_service,AsstProf,Prof,const
90,5,1,0,1.0
329,23,0,1,1.0
156,18,0,0,1.0
360,11,0,1,1.0
117,36,0,1,1.0


In [67]:
y_train.head()

90      97032
329    134778
156    113341
360    121946
117    117515
Name: salary, dtype: int64

In [69]:
# Step 6: fit ordinary least squares with salary as the response (endog)
# and the training design matrix as the predictors (exog).
mod3 = sm.OLS(endog=y_train, exog=x_train).fit()

In [71]:
print(mod3.summary())

                            OLS Regression Results                            
Dep. Variable:                 salary   R-squared:                       0.379
Model:                            OLS   Adj. R-squared:                  0.373
Method:                 Least Squares   F-statistic:                     63.71
Date:                Thu, 19 Dec 2024   Prob (F-statistic):           3.54e-32
Time:                        09:25:55   Log-Likelihood:                -3643.3
No. Observations:                 317   AIC:                             7295.
Df Residuals:                     313   BIC:                             7310.
Df Model:                           3                                         
Covariance Type:            nonrobust                                         
                  coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------------------------------------------------------
yrs_service  -161.8155    131.085     -1.234      

- Change in salary from Associate Professor (the reference level) to AsstProf -> m2 (about -13,000)
- Change in salary from Associate Professor (the reference level) to Prof -> m3

- Ho: m3 = 0
- Decision: reject Ho (p-value < 0.05)
- Conclusion: m3 != 0
- m3 = change in salary from Associate Professor to Prof
- With all other variables held constant, the change in salary from Associate Professor to Prof is significant: an increase of about +35,000.

- Associate Professor (reference level) ---> about 94,000
- AsstProf ---> 94,000 - 13,000 = 81,000
- Prof ---> 94,000 + 35,000 = 129,000