<a href="https://colab.research.google.com/github/FahimS45/Python_mini_projects/blob/master/Statistical_Exploration_of_Quality_Factors.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **A Statistical Analysis of Wine Quality**

In [None]:
# Importing all necessary modules
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import scipy.stats as stats

In [None]:
# Source: the red-wine file of the "Wine Quality" dataset hosted by the
# UC Irvine Machine Learning Repository
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-red.csv"

# Load the file into a DataFrame; its fields are separated by semicolons,
# so the delimiter has to be given explicitly
data = pd.read_csv(url, delimiter=';')

# **Descriptive Statistics**

In [None]:
# Summary statistics for every column of the dataset.
# Left as the last expression so the notebook renders the resulting table.
data.describe()

In [None]:
# Report how many distinct values each column contains
for column_name, column_values in data.items():
  distinct_count = np.unique(column_values).size
  print(f"{column_name} has {distinct_count} unique values")

In [None]:
# Box plot of every column on one wide figure (matplotlib + seaborn),
# which makes differences in scale and extreme values easy to spot
fig, ax = plt.subplots(1, figsize=(17, 4))
ax = sns.boxplot(data=data, ax=ax)
# Tilt the column names so the long labels do not overlap
ax.tick_params(axis='x', labelrotation=45)
plt.show()

In [None]:
# Removing rows with outliers in 'total sulfur dioxide'.
# Rows at or above this value are dropped.
SULFUR_DIOXIDE_CUTOFF = 200

# .copy() makes the filtered frame independent of the frame it was sliced from,
# so adding columns to `data` later does not raise SettingWithCopyWarning.
data = data[data['total sulfur dioxide'] < SULFUR_DIOXIDE_CUTOFF].copy()

# Re-draw the box plots on the filtered data
fig, ax = plt.subplots(1, figsize=(17, 4))
ax = sns.boxplot(data=data, ax=ax)
# Tilt the column names so the long labels do not overlap
ax.tick_params(axis='x', labelrotation=45)
plt.show()

In [None]:
# Exploration: alcohol content plotted against residual sugar
sns.scatterplot(data=data, x='residual sugar', y='alcohol')
plt.show()

# Pairwise regression plots of the acidity measures, coloured by quality
cols2plot = ['fixed acidity', 'volatile acidity', 'citric acid', 'quality']
pair_data = data.loc[:, cols2plot]
sns.pairplot(pair_data, kind='reg', hue='quality')
plt.show()

# **Conducting T-tests**

In [None]:
# Independent-samples t-test: volatile acidity of quality-3 wines vs. quality-8 wines

volatile_acidity = data['volatile acidity']
x = volatile_acidity[data['quality'] == 3]
y = volatile_acidity[data['quality'] == 8]
ttest = stats.ttest_ind(x, y)

# Small random horizontal offsets keep points with identical or very close
# values from being drawn on top of each other. Group x is centred on 0 and
# group y on 1. The draws are made in the same order as the points are plotted.
jitter_x = np.random.randn(len(x)) / 30
jitter_y = 1 + np.random.randn(len(y)) / 30
plt.plot(jitter_x, x, 'o')
plt.plot(jitter_y, y, 'o')

plt.xlim([-1, 2])
plt.xticks([0, 1], labels=['Qual 3', 'Qual 8'])
# Title reports the t-value and the p-value (probability) of the test
t_value = ttest.statistic
p_value = ttest.pvalue
plt.title(f't={t_value:.2f}, p={p_value:.5f}')
plt.ylabel('volatile acidity')
plt.show()

In [None]:
# Count how many wines received each quality rating from 3 through 8.
ratings = list(range(3, 9))

# value_counts() tallies every rating in one pass; reindex() puts the tallies
# in rating order and fills in 0 for any rating that does not occur.
rating_counts = data['quality'].value_counts().reindex(ratings, fill_value=0)

# Kept as a float NumPy array with one entry per rating
qualcounts = rating_counts.to_numpy(dtype=float)

# Showing in a bar plot
plt.bar(ratings, qualcounts)
plt.xlabel('Quality rating')
plt.ylabel('Count')
plt.show()

In [None]:
# Independent-samples t-test: volatile acidity of wines rated 3 or 4 vs. wines rated 7 or 8

low_quality = data['quality'].isin([3, 4])
high_quality = data['quality'].isin([7, 8])
x = data.loc[low_quality, 'volatile acidity']
y = data.loc[high_quality, 'volatile acidity']
ttest = stats.ttest_ind(x, y)

# Jittered x positions (group x around 0, group y around 1) so that
# overlapping values stay visible; both groups get a black marker edge
jitter_x = np.random.randn(len(x)) / 30
jitter_y = 1 + np.random.randn(len(y)) / 30
plt.plot(jitter_x, x, 'o', markeredgecolor='k')
plt.plot(jitter_y, y, 'o', markeredgecolor='k')
plt.xlim([-1, 2])
plt.xticks([0, 1], labels=['Qual 3+4', 'Qual 7+8'])
# Title reports the t-value and the p-value of the test
t_value = ttest.statistic
p_value = ttest.pvalue
plt.title(f't={t_value:.2f}, p={p_value:.5f}')
plt.ylabel('volatile acidity')
plt.show()


# **Regression Analysis**

***Multiple regression***

In [None]:
import statsmodels.api as sm

In [None]:
# Predicting quality from all other columns (multiple linear regression)
dep_var = data['quality']

# 'binquality' is derived from 'quality' further down the notebook. Dropping it
# here when it exists keeps this cell correct if it is re-run after that column
# has been added. errors='ignore' makes the drop a no-op on a fresh run.
ind_vars = data.drop(columns=['quality', 'binquality'], errors='ignore')

# Setting up model
ind_vars = sm.add_constant(ind_vars)  # Adding an intercept term
model = sm.OLS(dep_var, ind_vars).fit()

# Showing the summary
print(model.summary())

In [None]:
# Finding the significant predictors of the linear regression (p < .05).
# The intercept ('const') is not a column of `data`, so it is removed before
# filtering. Otherwise a significant intercept would make the column selection
# below fail with a KeyError.
predictor_pvalues = model.pvalues.drop(labels='const', errors='ignore')
significant_columns = list(predictor_pvalues[predictor_pvalues < .05].index)
for column_name in significant_columns:
  print(column_name)

# Pairwise plots just for significant effects. 'quality' is added only to the
# plotting selection (it is needed for the hue), so `significant_columns`
# keeps holding predictors only.
plot_columns = significant_columns + ['quality']
sns.pairplot(data[plot_columns], kind='reg', hue='quality')
plt.show()

***Logistic regression***

In [None]:
# Threshold for splitting the wines into two classes: the mean quality rating
binthresh = data['quality'].mean()
print(binthresh)

# Binarizing wine quality: True when a wine is rated above the threshold.
# assign() returns a new frame with the extra column instead of writing into
# a frame that may be a slice of another one (avoids SettingWithCopyWarning),
# and re-running the cell gives the same result.
data = data.assign(binquality=data['quality'] > binthresh)

In [None]:
# Predictor columns for the regression: every column except the two outcome columns
outcome_columns = ['quality', 'binquality']
Xcols = [column for column in data.keys() if column not in outcome_columns]
print(Xcols)

In [None]:
# Conducting the logistic regression of binarized quality on the predictor columns.
# The unfitted model gets its own name so that `model` (the fitted linear
# regression from the earlier cell) is not overwritten and the cells that read
# `model.pvalues` can still be re-run.
# NOTE(review): unlike the linear regression, no intercept column is added to
# the predictors here. Confirm whether that is intended.
logit_model = sm.Logit(data['binquality'], data[Xcols])
results = logit_model.fit(method='newton')

# Summarizing the output
print(results.summary())

In [None]:
# Finding the significant predictors of the logistic regression (p < .05)
significant_columnsL = list(results.pvalues[results.pvalues < .05].index)

# A plain loop is used instead of a list comprehension: as the last expression
# of the cell, the comprehension would make the notebook display a list of None.
for column_name in significant_columnsL:
  print(column_name)

In [None]:
# Printing significant predictors from both regressions
print("Significant predictors from standard regression:")
for column_name in significant_columns:
  # 'quality' is the outcome variable, not a predictor. It can be present in
  # this list because it is appended for plotting, so it is skipped here.
  if column_name != 'quality':
    print("  " + column_name)

print(' ')
print("Significant predictors from logistic regression:")
# The logistic list holds predictors only, so every entry is printed
for column_name in significant_columnsL:
  print("  " + column_name)

# **Transforming to Gaussian**

In [None]:
# Build a non-Gaussian signal (a random walk) and apply a rank-based transform to it.
n = 500

# A locally seeded generator makes the random walk, and every figure built
# from it, identical on each run.
rng = np.random.default_rng(0)
x = np.cumsum(rng.standard_normal(n))

# Ranks divided by (n+1) lie strictly between 0 and 1. Rescaling them to the
# open interval (-1, 1) keeps arctanh finite for every sample.
y = (stats.rankdata(x) / (n + 1) - .5) * 2
# NOTE: arctanh(2u-1) equals 0.5*log(u/(1-u)), so the result is bell-shaped but
# only approximately Gaussian. stats.norm.ppf applied to the scaled ranks
# would give exact normal quantiles.
y = np.arctanh(y)
print(np.min(y), np.max(y))

In [None]:
# 2x2 figure: left column shows the original data, right column the transformed
# data. The top row is a line plot, the bottom row a 40-bin histogram.
fig, ax = plt.subplots(2, 2, figsize=(9, 7))

panels = [(x, 'Original data'), (y, 'Transformed data')]
for column, (values, panel_title) in enumerate(panels):
  ax[0, column].plot(values)
  ax[0, column].set_title(panel_title)

  ax[1, column].hist(values, bins=40)
  ax[1, column].set_title(panel_title)

plt.show()

In [None]:
# Transformed values plotted against the original values, one square marker per sample
plt.plot(x, y, 's')
plt.xlabel('Original')
plt.ylabel('Transformed')  # axis label previously misspelled as 'Trasformed'
plt.show()