<a href="https://colab.research.google.com/github/B0BWAX/ML-OLY-SUSTAINABILTY-PREDICTION/blob/main/PREPROCESSING_AND_TRAINING.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# mount drive
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
# using kaggle API
!mkdir ~/.kaggle
!cp /content/drive/MyDrive/API-KEYS/kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json

In [3]:
# download dataset
!kaggle competitions download -c ml-olympiad-sustainable-urban-living

Downloading ml-olympiad-sustainable-urban-living.zip to /content
  0% 0.00/877k [00:00<?, ?B/s]
100% 877k/877k [00:00<00:00, 68.5MB/s]


In [4]:
!unzip ml-olympiad-sustainable-urban-living.zip

Archive:  ml-olympiad-sustainable-urban-living.zip
  inflating: sample_submission.csv   
  inflating: test.csv                
  inflating: train.csv               


In [6]:
import pandas as pd
import numpy as np

In [7]:
data = pd.read_csv('/content/train.csv')
data.head()

Unnamed: 0,Id,Property_Type,Property_Area,Number_of_Windows,Number_of_Doors,Furnishing,Frequency_of_Powercuts,Power_Backup,Water_Supply,Traffic_Density_Score,Crime_Rate,Dust_and_Noise,Air_Quality_Index,Neighborhood_Review,Habitability_score
0,0x21e3,Apartment,106,,1,Semi_Furnished,0.0,No,Once in a day - Morning,5.89,Slightly below average,Medium,90.0,3.86,71.98
1,0x68d4,Apartment,733,2.0,2,Unfurnished,1.0,No,Once in a day - Evening,4.37,Well below average,Medium,96.0,3.55,71.2
2,0x7d81,Apartment,737,4.0,2,Fully Furnished,0.0,No,Once in a day - Morning,7.45,Slightly below average,Medium,121.0,3.81,71.39
3,0x7a57,Apartment,900,3.0,2,Unfurnished,2.0,Yes,Once in a day - Morning,6.16,Well above average,Medium,100.0,1.34,31.46
4,0x9409,Bungalow,2238,14.0,6,Fully Furnished,0.0,No,All time,5.46,Well below average,Medium,116.0,4.77,93.7


### Preprocessing

In [14]:
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler

In [10]:
# Check for NaN values in the entire dataset
nan_values = data.isna().sum()
# Print columns with NaN values and the corresponding count of NaN values
print("Columns with NaN values:")
print(nan_values[nan_values > 0])

Columns with NaN values:
Number_of_Windows         1333
Furnishing                 828
Frequency_of_Powercuts    1109
Crime_Rate                 629
Dust_and_Noise             999
dtype: int64


In [12]:
# Imputing missing values for Number_of_Windows and Frequency_of_Powercuts
median_value = data['Number_of_Windows'].median()

data['Number_of_Windows'].fillna(median_value, inplace=True)

data['Frequency_of_Powercuts'].fillna(median_value, inplace=True)

data = data.dropna() # Drop other rows with Nan values

In [13]:
# One-Hot Encoding with pandas.get_dummies
categorical_cols = ['Property_Type', 'Furnishing', 'Power_Backup', 'Water_Supply', 'Dust_and_Noise','Crime_Rate' ]
data = pd.get_dummies(data, columns=categorical_cols)
data.drop(columns=['Id'], inplace=True)

In [16]:
X = data.drop(columns="Habitability_score")
y = data["Habitability_score"]

In [17]:
# Scaling features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

### Model Training

In [21]:
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score,GridSearchCV
from sklearn.svm import SVR
import pickle

In [20]:
# Define the parameter grid for GridSearchCV
param_grid = {
    'kernel': ['linear', 'rbf', 'poly'],  # Different kernel types
    'C': [0.1, 1, 10],  # Regularization parameter
    'degree': [2, 3],  # Degree for polynomial kernel (if applicable)
    'gamma': ['auto', 0.1, 1]  # Gamma for RBF kernel (if applicable)
}

In [22]:
svr = SVR()

n_splits = 7
kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)
grid_search = GridSearchCV(svr, param_grid, scoring='neg_mean_squared_error', cv=n_splits)

In [23]:
scores = []
for train_index, test_index in kf.split(X):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    # Fit scaler to the training features and transform both training and testing features
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    # Fit SVR model to the scaled training data
    svr.fit(X_train_scaled, y_train)

    # Evaluate SVR model on the scaled testing data
    score = svr.score(X_test_scaled, y_test)
    scores.append(score)

with open('ML_OLY_SVR_Model.pkl', 'wb') as f: # Save model
    pickle.dump(svr, f)

In [24]:
# Print the cross-validation scores
print("Cross-validation scores:", scores)

# Print the mean and standard deviation of the cross-validation scores
print("Mean cross-validation score:", np.mean(scores))
print("Standard deviation of cross-validation scores:", np.std(scores))

Cross-validation scores: [0.7221331959651303, 0.7274151417731147, 0.7367891625780398, 0.7309521999212302, 0.7234905195036307, 0.717244013391214, 0.7147301242854679]
Mean cross-validation score: 0.7246791939168326
Standard deviation of cross-validation scores: 0.0071278959466900054
