<a href="https://colab.research.google.com/github/Aryaan790/C-Code/blob/main/CS_4372_HW1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Import the libraries
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import statsmodels.api as sm
from sklearn.metrics import r2_score
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import SGDRegressor
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import GridSearchCV
from statsmodels.sandbox.regression.predstd import wls_prediction_std

In [None]:
# Read the red-wine quality CSV from the runtime's /content directory and preview the first rows.
DATA_PATH = "/content/winequality-red.csv"
df = pd.read_csv(DATA_PATH)
df.head()

In [None]:
# Report column names, dtypes and non-null counts.
# `info` is a method: without the call parentheses the cell only shows the bound-method repr.
df.info()

In [None]:
# Count the missing entries in each column.
df.isna().sum()

In [None]:
# Exploratory Data Analysis

In [None]:
# Scatter of alcohol content against the quality score (red dots).
ax = df.plot(x='alcohol', y='quality', style='.', color='r')
ax.set_title('alcohol vs quality')
ax.set_xlabel('alcohol')
ax.set_ylabel('quality')
ax.grid()
plt.show()

In [None]:
# Scatter of free vs. total sulfur dioxide (green dots).
ax = df.plot(x='free sulfur dioxide', y='total sulfur dioxide', style='.', color='g')
ax.set(
    title='free sulfur dioxide vs total sulfur dioxide',
    xlabel='free sulfur dioxide',
    ylabel='total sulfur dioxide',
)
ax.grid()
plt.show()

In [None]:
# One 20-bin histogram per column, to inspect how each feature is distributed.
df.hist(figsize=(10, 10), bins=20)
plt.show()

In [None]:
# Bar chart of citric acid per quality level, to see how citric acid varies with quality.
sns.catplot(
    data=df,
    x="quality",
    y="citric acid",
    kind="bar",
    palette="pastel",
    alpha=.5,
    height=5,
)

In [None]:
# Distribution of citric acid within each quality level.
sns.boxplot(data=df, x="quality", y="citric acid")

In [None]:
# Distribution of fixed acidity within each quality level.
sns.boxplot(data=df, x="quality", y="fixed acidity")

In [None]:
# Pairwise column correlations, rounded to two decimals and drawn as an annotated heatmap.
correlation_matrix = df.corr()
correlation_matrix = correlation_matrix.round(2)
sns.heatmap(correlation_matrix, annot=True)

In [None]:
# Define X and y
# Binary target: 1 when the quality score is at least 6, otherwise 0.
df['winequality'] = (df['quality'] >= 6).astype('int64')
# Features are every column except the raw score and the label derived from it.
X = df.drop(columns=['quality', 'winequality'])
y = df['winequality']
n = len(y)

# Hold out 20% of the rows as a test set (fixed seed for a repeatable split).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

# Min-max scale the features; the scaler is fitted on the training rows only
# and the same fitted transform is then applied to both splits.
mm = MinMaxScaler()
fit = mm.fit(X_train)
X_train, X_test = fit.transform(X_train), fit.transform(X_test)

In [None]:
# SGDRegressor
# train the model using the training set
# Iteration budget: about 10**6 sample updates in total, divided by the row count `n`
# (`n` is set in the data-preparation cell).
max_iter = int(np.ceil(10**6 / n))
# SGD is stochastic; fixing random_state makes this fit repeatable across runs,
# matching the seeded train/test split.
reg_model = SGDRegressor(max_iter=max_iter, random_state=0)
reg_model.fit(X_train, y_train)

# predict the target variable on both splits using the trained model
y_pred_train_sgd = reg_model.predict(X_train)
y_pred_test_sgd = reg_model.predict(X_test)

In [None]:
# Grid search over SGDRegressor hyperparameters.
# The grid has 6 * 4 * 3 * 3 * 2 * 2 = 864 combinations, each of which is cross-validated,
# so this cell can take a while.
param_grid = {
    'alpha': 10.0 ** -np.arange(1, 7),  # 1e-1 ... 1e-6
    'loss': ['squared_error', 'huber', 'epsilon_insensitive', 'squared_epsilon_insensitive'],
    'penalty': ['l2', 'l1', 'elasticnet'],
    'learning_rate': ['constant', 'optimal', 'invscaling'],
    'tol': [1e-10, 1e-3],
    'eta0': [0.001, 0.01]
}

# Hyperparameter Tuning
# n_jobs=-1 uses every available core instead of assuming the machine has 8.
clf = GridSearchCV(reg_model, param_grid, n_jobs=-1, verbose=3)
clf.fit(X_train, y_train)

In [None]:
# Mount Google Drive at /content/drive. `google.colab` is only importable inside Google Colab.
# NOTE(review): nothing visible in this notebook reads from /content/drive — confirm the mount is still needed.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Report the winning configuration found by the grid search.
# Per-candidate scores are available in clf.cv_results_ (the old grid_scores_ attribute no longer exists).
for label, value in (
    ('Best Score: ', clf.best_score_),
    ('Best Parameters: ', clf.best_params_),
    ("Best Estimator: ", clf.best_estimator_),
):
    print(label, value)

In [None]:
# Model evaluation (RMSE and R2) for the SGD predictions on the training and testing sets.
# Note: these predictions come from the initial SGDRegressor fit, not from the grid-search result.
evaluations = [
    # (header line, true values, predicted values, text printed after the section or None)
    ("The model performance for training set", y_train, y_pred_train_sgd, "\n"),
    ("The model performance for testing set", y_test, y_pred_test_sgd, None),
]

for header, y_true, y_hat, trailer in evaluations:
    rmse = np.sqrt(mean_squared_error(y_true, y_hat))
    r2 = r2_score(y_true, y_hat)

    print(header)
    print("--------------------------------------")
    print('RMSE is {}'.format(rmse))
    print('R2 score is {}'.format(r2))
    if trailer is not None:
        print(trailer)

In [None]:
# OLS
# Fit an ordinary least-squares linear model on the training set (fit returns the estimator itself).
reg_model = LinearRegression().fit(X_train, y_train)

# Predictions of the fitted model on the training and test features.
y_pred_train, y_pred_test = (reg_model.predict(part) for part in (X_train, X_test))

In [None]:
# Model evaluation (RMSE and R2) for the LinearRegression predictions on the training and testing sets.
headers = ("The model performance for training set", "The model performance for testing set")
targets = (y_train, y_test)
predictions = (y_pred_train, y_pred_test)

for position, (header, y_true, y_hat) in enumerate(zip(headers, targets, predictions)):
    rmse = np.sqrt(mean_squared_error(y_true, y_hat))
    r2 = r2_score(y_true, y_hat)

    print(header)
    print("--------------------------------------")
    print('RMSE is {}'.format(rmse))
    print('R2 score is {}'.format(r2))
    # Blank separator after the training section only.
    if position == 0:
        print("\n")

In [None]:
# Ordinary Least Squares (OLS) with statsmodels:
# add an intercept column to the training features, fit, and print the regression summary.
x = sm.add_constant(X_train)
model = sm.OLS(endog=y_train, exog=x).fit()
print(model.summary())

In [None]:
# In-sample RMSE of the statsmodels OLS fit: predict on the same design matrix `x`
# the model was fitted on and compare with the training targets.
ypred = model.predict(x)
rmse = sm.tools.eval_measures.rmse(y_train, ypred)
print(rmse)

In [None]:
# Plot results from linear model and SGD
# Test-set actual values go on the x-axis and each model's predictions on the y-axis.
# `y_pred_test` holds the LinearRegression test predictions (there is no `y_pred` variable).
plt.scatter(y_test, y_pred_test, label='LR')
plt.scatter(y_test, y_pred_test_sgd, label='SGD')
# Raw strings keep the LaTeX backslash in \hat from being read as a string escape.
plt.xlabel(r"Actual: $Y_i$")
plt.ylabel(r"Prediction: $\hat{Y}_i$")
plt.title("Actual vs Predicted")
plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left', borderaxespad=0.)  # place the legend outside the axes, to the right
plt.show()