# Regression Case Study
-  Data: the `diamonds` dataset from seaborn (`sns`)
-  Goal: how the variables affect the dependent variable (DV)
-  Extracted from a LinkedIn post
-  Statistical tests and a regression model on the diamonds dataset from the seaborn library, covering important data concepts: data cleaning, data exploration, data visualization, OLS regression, and hypothesis testing (ANOVA, post hoc test, ANCOVA, MANOVA).

In [None]:
#libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px

In [None]:
#data 
df = sns.load_dataset('diamonds')
df.info()

In [None]:
df.head()

In [None]:
# take log of price
df['log_price'] = np.log(df['price'])
df.head(2)

In [None]:
#subset columns
df1 = df[['color','price','log_price']]
df1.tail(2)

In [None]:
df1.info()

In [None]:
df1.columns

In [None]:
df1.shape

In [None]:
df1.describe()
#high sd for price

In [None]:
df1.price.plot(kind='hist')

In [None]:
df1.price.plot(kind='kde')

In [None]:
sns.displot(data=df1, x = 'price', kde = True)

In [None]:
#missing values - Nil
df1.isnull().sum()

In [None]:
df1.head(3)

In [None]:
# Count the rows for each color as a two-column frame: 'color' and 'count'.
# The index and the counts are named explicitly instead of renaming whatever
# labels reset_index() produces: those labels changed in pandas 2.0, where the
# old rename mapped 'color' -> 'count' and left the frame without a 'color'
# column, breaking the charts that read colorCount.
colorCount = (
    df1['color']
    .value_counts()
    .rename_axis('color')
    .reset_index(name='count')
)
colorCount

In [None]:
# Categorical Values 
fig = px.bar(colorCount, x='color',y='count' )
fig.show();

In [None]:
#df1.loc[df['pop'] < 2.e6, 'country'] = 'Other countries' # Represent only large countries
fig = px.pie(colorCount, names='color', values='count', title='Colors and their Count : Pie Chart')
fig.show();

In [None]:
# Mean price per color, as a frame with columns 'color' and 'price'.
# The built-in groupby mean() is used instead of agg(np.mean): it matches the
# other groupby cells in this notebook and avoids the FutureWarning newer
# pandas emits when a NumPy callable such as np.mean is passed to agg.
meanPriceColor = df1.groupby(['color'], as_index=False)['price'].mean()
print(meanPriceColor)

# Bar chart of the per-color mean price, one bar colour per diamond color.
fig = px.bar(meanPriceColor, x='color', y='price', color='color', title='Mean Price per Color')
fig.show()

In [None]:
# box plot
fig = px.box(df1, x="color", y="price")
fig.show()

In [None]:
sns.boxplot(data=df1, x='color', y='price')

In [None]:
df1['price'].describe()

In [None]:
q1=df1[['price']].quantile(0.25)
q3=df1[['price']].quantile(0.75)
IQR=q3-q1
IQR, q1, q3

In [None]:
q1 - IQR, q3 + IQR

In [None]:
#outliers.sum()

In [None]:
df1.query('price < -3425')

In [None]:
df1.query('price > 9868')
#5355

In [None]:
df1.loc[df1['price'] > 9868, :]

In [None]:
df2 = df1.loc[df1['price'] <= 9868, :]
df2.shape

In [None]:
np.where(df1[(df1['price'] <= 9868)])

In [None]:
# Regression 
import statsmodels.api as st
from statsmodels.formula.api import ols

In [None]:
olsmodel = ols(formula ='log_price ~ C(color)', data=df1).fit()
# we wanted to treat explicitly as categorical, we could have done so by using the C() operator:

In [None]:
olsmodel.summary()

In [None]:
# ANOVA table for the color-only model.
# anova_lm selects the sum-of-squares type through the keyword `typ`; the
# original `type=2` is not an option it reads, so the default Type I table was
# produced instead of the requested Type II.
st.stats.anova_lm(olsmodel, typ=2)

-  A t test checks whether the mean is equal between 2 groups of data; ANOVA extends the same idea to more than 2 groups.
-  Question: does color affect the price of a diamond?
-  The p value tells whether Ho can be rejected: p < .05 means a significant difference due to color -> reject Ho: color affects the mean price.
-  The F value indicates how strongly color affects the outcome (log_price).

In [None]:
df.columns

In [None]:
#add another column, ie cut 
df1 = df1.join(df['cut'])
df1.head(3)

In [None]:
sns.boxplot(data=df1, x='cut', y='log_price')

In [None]:
# Bar plot of how many diamonds fall in each cut.
# The count frame names its columns explicitly ('cut', 'count') rather than
# renaming the labels reset_index() produces: under pandas 2.0+ the old rename
# turned 'cut' into 'count' and left no 'cut' column for the x axis.
sns.barplot(
    data=df1['cut'].value_counts().rename_axis('cut').reset_index(name='count'),
    x='cut',
    y='count',
)

In [None]:
df1.groupby(['cut'], as_index=False)['log_price'].mean()

In [None]:
sns.barplot(data=df1.groupby(['cut'], as_index=False)['log_price'].mean(), x='cut',y='log_price')
#almost same with cut type

In [None]:
#second model
olsmodel2 = ols(formula = 'log_price ~ C(color) + C(cut) + C(color):C(cut)', data=df1).fit()

In [None]:
olsmodel2.summary()

## hypothesis setting
- Color
    -  Ho : No difference in price due to color, Ha : Difference in price due to color
- Cut
    -  Ho : No difference in price due to cut, Ha : Difference in price due to cut
-  Color x Cut interaction (combination)
    -  Ho : No difference in price due to the interaction of cut & color, Ha : Difference in price due to the interaction of cut & color



In [None]:
# ANOVA table for the color + cut + interaction model.
# anova_lm selects the sum-of-squares type through the keyword `typ`; the
# original `type=2` is not an option it reads, so the default Type I
# (sequential) table was produced, whose results depend on the order of the
# terms in the formula. `typ=2` requests the Type II table that was intended.
st.stats.anova_lm(olsmodel2, typ=2)

In [None]:
from statsmodels.stats.multicomp import pairwise_tukeyhsd

In [None]:
tukeyhsd = pairwise_tukeyhsd(endog = df1['log_price'], groups = df1['color'], alpha = .05)

In [None]:
tukeyhsd.summary()
# Reject Column
##  True - Reject Ho in favour of Ha
##  False - Accept Ho

## Ho
-  Ho (no difference in price due to color) is not rejected only for these pairs:
    -  D & E
    -  F & G
-  All the remaining pairs show a difference in price due to color

In [None]:
# add carat to the model
df1 = df1.join(df['carat'])
df1.head()

In [None]:
olsmodel3 = ols('log_price ~ C(color) + carat', df1).fit()
olsmodel3.summary()

In [None]:
# ANCOVA table for the color + carat model.
# anova_lm selects the sum-of-squares type through the keyword `typ`; the
# original `type=2` is not an option it reads, so the default Type I
# (sequential) table was produced. `typ=2` requests the Type II table, in
# which color is assessed after adjusting for carat.
ancova_table = st.stats.anova_lm(olsmodel3, typ=2)
ancova_table

In [None]:
# Regression of log_price on carat, fitted and coloured separately per color.
# NOTE(review): the title says "multiple covariates", but carat is the only
# covariate plotted here - confirm the wording.
sns.lmplot(
    x='carat',
    y='log_price',
    hue='color',
    data=df1,
).set(title='ANCOVA plot with multiple covariates')

In [None]:
#!pip install pingouin
from pingouin import ancova

In [None]:
# ANCOVA with pingouin: log_price as the dependent variable, color as the
# between-subject factor and carat as the covariate.
ancova(
    dv='log_price',
    between='color',
    covar='carat',
    data=df1,
)

### Ancova summary
-   p-unc (uncorrected p value) < .05 -> reject Ho that each color results in the same average log_price even after controlling for the carat variable
-   Conclude that color itself affects the price of a diamond

In [None]:
df1.groupby('color')['log_price'].describe()

In [None]:
df1.groupby('color')['carat'].describe()

In [None]:
sns.boxplot(data=df1, x='color', y='carat')
#outliers seen

In [None]:
df1.carat.hist()

In [None]:
# MANOVA
from statsmodels.multivariate.manova import MANOVA
fit = MANOVA.from_formula('log_price + carat ~ color', data =df1)
print(fit.mv_test())

In [None]:
# p value < .005 
# strong association between color & caret + log_price 

In [None]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as lda

In [None]:
sampleDF1 = df1.sample(n=100, random_state=32190).reset_index()
sampleDF1.shape

In [None]:
sampleDF1.head(2)

In [None]:
X = sampleDF1[['log_price', 'carat']]
y = sampleDF1['color']
X.shape, y.shape

In [None]:
sampleDF1.isnull().sum()

In [None]:
X_ne = pd.DataFrame(lda().fit(X=X, y=y).transform(X), columns=['lda1', 'lda2'])

In [None]:
X_ne.head()

In [None]:
X_ne['color'] = sampleDF1[['color']]
X_ne.head(2)

In [None]:
sns.scatterplot(data= X_ne, x='lda1', y='lda2', hue='color')

# End here
-  https://www.linkedin.com/feed/update/urn:li:activity:7088692799499362304/