<a href="https://colab.research.google.com/github/Bookky123/Big-Data-/blob/main/Final_Project_tools.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install opendatasets -q

# **Import libraries**

In [None]:
from os.path import split
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from imblearn.over_sampling import SMOTE
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC

## Download data from Kaggle

In [None]:
# Download the red-wine-quality dataset from Kaggle with opendatasets.
# NOTE(review): presumably prompts for Kaggle credentials when none are
# configured — confirm before running unattended.
import opendatasets as od
od.download('https://www.kaggle.com/datasets/uciml/red-wine-quality-cortez-et-al-2009')

In [None]:
# Read the downloaded CSV into a DataFrame and preview five random rows.
# The path is relative to the working directory so the notebook also runs
# outside Colab (on Colab the working directory is /content, so this resolves
# to the same file as before).
# NOTE(review): assumes od.download() used its default target directory.
file = 'red-wine-quality-cortez-et-al-2009/winequality-red.csv'
df = pd.read_csv(file)
df.sample(5)

In [None]:
# Print column names, dtypes and non-null counts for the loaded frame.
df.info()

# **Data Preprocessing**

## missing value

In [None]:
# Number of missing values in each column.
df.isna().sum()

In [None]:
# Keep the column names as a plain list; the plotting cell indexes into it.
columns = df.columns.tolist()
columns

## visualization & plots

### heatmap

In [None]:
# Pairwise correlation of all columns, drawn as an annotated heatmap on a
# fixed [-1, 1] colour scale.
plt.figure(figsize=(10, 8))
corr = df.corr()
sns.heatmap(
    corr,
    vmin=-1,
    vmax=1,
    cmap='coolwarm',
    annot=True,
    fmt='.2f',
)
plt.title('Correlation Matrix')

### Scatterplot & Boxplot

In [None]:
# One row per feature (the first 11 entries of `columns`):
# left panel is a boxplot of the feature, right panel is feature vs. quality.
fig, ax = plt.subplots(nrows=11, ncols=2, figsize=(16, 45))
plt.subplots_adjust(hspace=0.5)
for i, feature in enumerate(columns[:11]):
    box_ax, scatter_ax = ax[i]
    sns.boxplot(data=df, x=feature, ax=box_ax)
    sns.scatterplot(data=df, x=feature, y='quality', hue='quality', ax=scatter_ax)

### Target categorizing

In [None]:
# Distinct values present in the target column.
df['quality'].unique()

In [None]:
# Collapse the numeric quality scores into three labels:
# 3-4 -> 'bad', 5-6 -> 'middle', 7-8 -> 'good'.
df = df.replace({
    'quality': {
        **dict.fromkeys((3, 4), 'bad'),
        **dict.fromkeys((5, 6), 'middle'),
        **dict.fromkeys((7, 8), 'good'),
    }
})

In [None]:
# Preview the first five rows after relabelling the target.
df.head()

### Split features and target (no normalization is applied)

In [None]:
# Target is the categorised quality label; features are every other column.
y = df['quality']
X = df.drop('quality', axis=1)

In [None]:
# Summary statistics for the frame's columns.
df.describe()

# Modeling

In [None]:
# Class frequencies of the target, to see how imbalanced the labels are.
df['quality'].value_counts()

In [None]:
# Hold out 20% of the rows for testing.
# stratify=y keeps the class proportions equal in both partitions, so the
# rare classes are represented in the test set instead of being left to chance.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

## SMOTE

In [None]:
# Select the training rows to be balanced; here only the 'good' and 'middle'
# classes are of interest.
y_train_gm = y_train[y_train.isin(['good', 'middle'])]
X_train_gm = X_train.loc[y_train_gm.index]

# Oversample with SMOTE so that 'good' and 'middle' are balanced.
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_train_gm, y_train_gm)

In [None]:
# Append the untouched 'bad' training rows to the SMOTE-balanced good/middle rows.
# A boolean mask replaces the X_train_bad / y_train_bad temporaries because
# those names are reassigned further down by the separate is_bad split, and
# sharing them makes this cell depend on execution order.
bad_mask = y_train == 'bad'

# ignore_index gives both results a clean 0..n-1 index; without it the SMOTE
# output's fresh index and the original row labels overlap.
X_train_resampled = pd.concat([X_resampled, X_train[bad_mask]], ignore_index=True)
y_train_resampled = pd.concat([y_resampled, y_train[bad_mask]], ignore_index=True)
# Only 'good' and 'middle' are balanced at this point; 'bad' was not
# oversampled and remains the minority class.

## RandomForestClassifier

In [None]:
# Hyper-parameter grid for the random forest.
parameters = {
    'n_estimators' : [50,150,500],
    'criterion' : ['gini','entropy'],
    'max_features' : ['sqrt','log2']
}

# A fixed random_state makes the search reproducible: without it, every run
# builds different forests and may report a different "best" combination.
rfc = RandomForestClassifier(n_jobs=-1, random_state=42)
rfc_cv = GridSearchCV(estimator=rfc, param_grid=parameters, cv=5).fit(X_train_resampled, y_train_resampled)

# NOTE(review): the CV folds are drawn from data that already contains SMOTE
# samples, so best_score_ is probably optimistic — compare with the test score.
print(rfc_cv.best_params_)
print('accuracy_score :' ,rfc_cv.best_score_)

In [None]:
# Train the forest with the best grid-search parameters.
# The seed is fixed so the reported score is reproducible across runs.
rf = RandomForestClassifier(**rfc_cv.best_params_, random_state=42).fit(X_train_resampled, y_train_resampled)

# Predict once on the held-out set and reuse the predictions for the score,
# so the score and the reports below describe the same predictions.
y_pred_rf = rf.predict(X_test)

rf_score = round(accuracy_score(y_test, y_pred_rf),3)
print('RandomForestClassifier Score : ',rf_score)

In [None]:
# Class frequencies in the held-out test labels.
y_test.value_counts()

In [None]:
# Text report for the random-forest predictions on the test set.
print(classification_report(y_test, y_pred_rf))

In [None]:
# Confusion matrix for the random-forest predictions.
# NOTE(review): no `labels=` is passed, so row/column order is presumably the
# sorted label order — confirm when reading the matrix.
print(confusion_matrix(y_test, y_pred_rf))

### Bad & Not Bad

แยกทำนายค่า Bad เพราะโมเดลไม่สามารถทำนายค่า Bad ได้เลย

In [None]:
# Binary target: 1 where the wine is labelled 'bad', 0 otherwise.
# It is built as a standalone Series instead of a new df column, so df is not
# mutated and re-running the earlier `X = df.drop(columns='quality')` cell
# cannot pull the label into the feature matrix.
y_bad = (df['quality'] == 'bad').astype(int).rename('is_bad')

# Features: every column except the label. errors='ignore' also drops an
# 'is_bad' column if one is still on df from an earlier run of this notebook.
X_bad = df.drop(columns=['quality', 'is_bad'], errors='ignore')

In [None]:
# 80/20 split for the bad-vs-not-bad task.
# stratify=y_bad keeps the share of 'bad' wines equal in train and test, so
# the positive class is present in both partitions.
X_train_bad, X_test_bad, y_train_bad, y_test_bad = train_test_split(
    X_bad, y_bad, test_size=0.2, random_state=42, stratify=y_bad
)

In [None]:
# Build the bad-vs-not-bad model with the parameters found by the grid search.
# The model is fitted once (the original fitted it twice in a row) and seeded
# so the result is reproducible.
rf_bad = RandomForestClassifier(**rfc_cv.best_params_, random_state=42)
rf_bad.fit(X_train_bad, y_train_bad)

In [None]:
# Predict the is_bad label for the held-out rows of the bad-vs-not-bad split.
y_pred_bad = rf_bad.predict(X_test_bad)

In [None]:
# Text report for the bad-vs-not-bad predictions.
print(classification_report(y_test_bad, y_pred_bad))

In [None]:
# Confusion matrix for the bad-vs-not-bad predictions.
print(confusion_matrix(y_test_bad, y_pred_bad))

##  KNeighborsClassifier

In [None]:
# Search space for k-nearest neighbours: odd k from 3 to 49, both weighting
# schemes, and Minkowski powers 1 through 4.
parameters = {
    'n_neighbors': list(np.arange(3, 50, 2)),
    'weights': ['uniform', 'distance'],
    'p': [1, 2, 3, 4],
}

# 10-fold grid search on the resampled training data.
# NOTE(review): the features are not scaled before this distance-based model — confirm that is intended.
knn = KNeighborsClassifier()
knn_cv = GridSearchCV(estimator=knn, param_grid=parameters, cv=10)
knn_cv.fit(X_train_resampled, y_train_resampled)

print(knn_cv.best_params_)
print('accurracy ', knn_cv.best_score_)

In [None]:
# GridSearchCV (default refit=True) has already refit the best parameter
# combination on the full resampled training data, so reuse that estimator
# instead of training an identical model a second time.
knn = knn_cv.best_estimator_

In [None]:
# Predict with the tuned `knn` model prepared in the previous cell (it was
# otherwise unused) and score those same predictions, so the test set is
# evaluated once.
y_pred_knn = knn.predict(X_test)

knn_score = round(accuracy_score(y_test, y_pred_knn), 3)
print('KNeighborsClassifier Score : ',knn_score)

In [None]:
# Text report for the KNN predictions on the test set.
print(classification_report(y_test, y_pred_knn))

In [None]:
# Confusion matrix for the KNN predictions.
print(confusion_matrix(y_test, y_pred_knn))

## SVC

In [None]:
# Search space for the SVC: C and gamma both range over the same seven
# values from 0.001 to 1000.
parameters = {
    'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000],
    'gamma': [0.001, 0.01, 0.1, 1, 10, 100, 1000],
}

# 10-fold grid search on the resampled training data.
# NOTE(review): the features are not scaled before the SVC — confirm that is intended.
svc = SVC()
svc_cv = GridSearchCV(estimator=svc, param_grid=parameters, cv=10)
svc_cv.fit(X_train_resampled, y_train_resampled)

print('Turned hyper parameter : ', svc_cv.best_params_)
print('accurracy score : ', svc_cv.best_score_)

In [None]:
# GridSearchCV (default refit=True) has already refit the best parameter
# combination on the full resampled training data, so reuse that estimator
# instead of training an identical SVC a second time.
svc = svc_cv.best_estimator_

In [None]:
# Predict and score with the same tuned model. The original took predictions
# from `svc_cv` but the score from `svc`; scoring the stored predictions keeps
# the score and the reports below tied to one set of predictions.
y_pred_svc = svc.predict(X_test)

svc_score = round(accuracy_score(y_test, y_pred_svc),3)
print('SVC Score : ', svc_score)

In [None]:
# Text report for the SVC predictions on the test set.
print(classification_report(y_test, y_pred_svc))

In [None]:
# Confusion matrix for the SVC predictions.
print(confusion_matrix(y_test, y_pred_svc))

# Result

In [None]:
# One row per model with its test-set score, ordered from lowest to highest.
result = pd.DataFrame(
    {
        'Model': ['RandomForestClassifier', 'KNeighborsClassifier', 'SVC'],
        'Score': [rf_score, knn_score, svc_score],
    }
).sort_values(by='Score', ascending=True)

In [None]:
# Bar chart of the test score for each model.
fig, ax = plt.subplots(1, 1, figsize=(15,5))

# Draw on the axes created above instead of relying on the implicit
# "current axes".
sns.barplot(x='Model',y='Score', data=result, ax=ax)
ax.bar_label(ax.containers[0])
# Rotate the labels seaborn already placed rather than overwriting them by
# position with set_xticklabels, which could mislabel bars if the order changed.
ax.tick_params(axis='x', labelrotation=300)
plt.show()

**The Best Score : RandomForestClassifier**

In [None]:
# Final model: a random forest with the best grid-search parameters.
# The seed is fixed so refitting on the full data set is reproducible.
rfc = RandomForestClassifier(**rfc_cv.best_params_, random_state=42)

In [None]:
# Fit the final model on the full feature matrix X and target y (train and test rows together).
rfc.fit(X,y)