In [1]:
import numpy as np
import pandas as pd

from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split

from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier, BaggingClassifier, GradientBoostingClassifier, RandomForestClassifier

In [9]:
# Load the generation CSV from its hard-coded path into a DataFrame.
data = pd.read_csv(filepath_or_buffer='/content/file_02.csv')

In [10]:
# Display the freshly loaded frame.
data

Unnamed: 0,index,Date,Region,Thermal Generation Actual (in MU),Thermal Generation Estimated (in MU),Nuclear Generation Actual (in MU),Nuclear Generation Estimated (in MU),Hydro Generation Actual (in MU),Hydro Generation Estimated (in MU)
0,0,2017-09-01,Northern,624.23,484.21,30.36,35.57,273.27,320.81
1,1,2017-09-01,Western,1106.89,1024.33,25.17,3.81,72.00,21.53
2,2,2017-09-01,Southern,576.66,578.55,62.73,49.80,111.57,64.78
3,3,2017-09-01,Eastern,441.02,429.39,,,85.94,69.36
4,4,2017-09-01,NorthEastern,29.11,15.91,,,24.64,21.21
...,...,...,...,...,...,...,...,...,...
4940,305,2020-08-01,Northern,669.47,602.96,26.88,23.41,348.72,351.98
4941,306,2020-08-01,Western,1116.00,1262.10,42.37,36.63,54.67,20.28
4942,307,2020-08-01,Southern,494.66,415.53,61.83,26.28,93.49,77.25
4943,308,2020-08-01,Eastern,482.86,547.03,,,87.22,93.78


In [11]:
# Summarise each column's dtype and non-null count.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4945 entries, 0 to 4944
Data columns (total 9 columns):
 #   Column                                Non-Null Count  Dtype  
---  ------                                --------------  -----  
 0   index                                 4945 non-null   int64  
 1   Date                                  4945 non-null   object 
 2   Region                                4945 non-null   object 
 3   Thermal Generation Actual (in MU)     4945 non-null   object 
 4   Thermal Generation Estimated (in MU)  4945 non-null   object 
 5   Nuclear Generation Actual (in MU)     2967 non-null   float64
 6   Nuclear Generation Estimated (in MU)  2967 non-null   float64
 7   Hydro Generation Actual (in MU)       4945 non-null   float64
 8   Hydro Generation Estimated (in MU)    4945 non-null   float64
dtypes: float64(4), int64(1), object(4)
memory usage: 347.8+ KB


In [12]:
## Dropping the Index Column

In [13]:
# Remove the 'index' column so it is not carried into the feature set.
data = data.drop(columns=['index'])

In [14]:
# Fraction of missing values in each column.
data.isna().mean()

Date                                    0.0
Region                                  0.0
Thermal Generation Actual (in MU)       0.0
Thermal Generation Estimated (in MU)    0.0
Nuclear Generation Actual (in MU)       0.4
Nuclear Generation Estimated (in MU)    0.4
Hydro Generation Actual (in MU)         0.0
Hydro Generation Estimated (in MU)      0.0
dtype: float64

In [15]:
# Fill the gaps in both nuclear columns with that column's own mean.
nuclear_columns = [
    'Nuclear Generation Actual (in MU)',
    'Nuclear Generation Estimated (in MU)',
]
# fillna with a Series of means (indexed by column name) fills column by column.
data[nuclear_columns] = data[nuclear_columns].fillna(data[nuclear_columns].mean())

In [16]:
# Sanity check: total number of NaNs left anywhere in the frame.
print(f"Total missing values: {data.isna().sum().sum()}")

Total missing values: 0


In [17]:
## Creating Year and Month Columns

In [18]:
# Show the frame after the index column was dropped and the nuclear gaps were filled.
data

Unnamed: 0,Date,Region,Thermal Generation Actual (in MU),Thermal Generation Estimated (in MU),Nuclear Generation Actual (in MU),Nuclear Generation Estimated (in MU),Hydro Generation Actual (in MU),Hydro Generation Estimated (in MU)
0,2017-09-01,Northern,624.23,484.21,30.360000,35.570000,273.27,320.81
1,2017-09-01,Western,1106.89,1024.33,25.170000,3.810000,72.00,21.53
2,2017-09-01,Southern,576.66,578.55,62.730000,49.800000,111.57,64.78
3,2017-09-01,Eastern,441.02,429.39,37.242208,36.987877,85.94,69.36
4,2017-09-01,NorthEastern,29.11,15.91,37.242208,36.987877,24.64,21.21
...,...,...,...,...,...,...,...,...
4940,2020-08-01,Northern,669.47,602.96,26.880000,23.410000,348.72,351.98
4941,2020-08-01,Western,1116.00,1262.10,42.370000,36.630000,54.67,20.28
4942,2020-08-01,Southern,494.66,415.53,61.830000,26.280000,93.49,77.25
4943,2020-08-01,Eastern,482.86,547.03,37.242208,36.987877,87.22,93.78


In [19]:
# Split the 'Date' string (YYYY-MM-DD) into integer Year and Month features.
# The vectorised .str accessor replaces the per-row np.int(...) call: np.int was
# deprecated in NumPy 1.20 and removed in 1.24, where it raises AttributeError.
# The slices are the same character positions as before, and int64 matches the
# dtype the original cell produced.
data['Year'] = data['Date'].str[0:4].astype('int64')
data['Month'] = data['Date'].str[5:7].astype('int64')

# The raw date string is no longer needed once its parts are extracted.
data = data.drop('Date', axis=1)

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  """Entry point for launching an IPython kernel.
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  


In [20]:
## Removing Commas from the Thermal Columns

In [21]:
# The thermal columns hold strings (object dtype) because some values contain a
# ',' separator. Strip the commas and convert to float64.
# The vectorised .str.replace(...).astype(...) replaces the per-row np.float(...)
# call: np.float was deprecated in NumPy 1.20 and removed in 1.24.
# regex=False treats ',' as a literal character.
for column in ['Thermal Generation Actual (in MU)', 'Thermal Generation Estimated (in MU)']:
    data[column] = data[column].str.replace(',', '', regex=False).astype('float64')

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  


In [22]:
# Show the frame after the date split and the thermal-column conversion.
data

Unnamed: 0,Region,Thermal Generation Actual (in MU),Thermal Generation Estimated (in MU),Nuclear Generation Actual (in MU),Nuclear Generation Estimated (in MU),Hydro Generation Actual (in MU),Hydro Generation Estimated (in MU),Year,Month
0,Northern,624.23,484.21,30.360000,35.570000,273.27,320.81,2017,9
1,Western,1106.89,1024.33,25.170000,3.810000,72.00,21.53,2017,9
2,Southern,576.66,578.55,62.730000,49.800000,111.57,64.78,2017,9
3,Eastern,441.02,429.39,37.242208,36.987877,85.94,69.36,2017,9
4,NorthEastern,29.11,15.91,37.242208,36.987877,24.64,21.21,2017,9
...,...,...,...,...,...,...,...,...,...
4940,Northern,669.47,602.96,26.880000,23.410000,348.72,351.98,2020,8
4941,Western,1116.00,1262.10,42.370000,36.630000,54.67,20.28,2020,8
4942,Southern,494.66,415.53,61.830000,26.280000,93.49,77.25,2020,8
4943,Eastern,482.86,547.03,37.242208,36.987877,87.22,93.78,2020,8


**Encoding Labels**

In [23]:
# Replace each region name with an integer class id. The fitted encoder stays
# in `label_encoder` so the mapping remains available afterwards.
label_encoder = LabelEncoder()
label_encoder.fit(data['Region'])
data['Region'] = label_encoder.transform(data['Region'])

In [24]:
# Show the frame with Region replaced by its integer code.
data

Unnamed: 0,Region,Thermal Generation Actual (in MU),Thermal Generation Estimated (in MU),Nuclear Generation Actual (in MU),Nuclear Generation Estimated (in MU),Hydro Generation Actual (in MU),Hydro Generation Estimated (in MU),Year,Month
0,2,624.23,484.21,30.360000,35.570000,273.27,320.81,2017,9
1,4,1106.89,1024.33,25.170000,3.810000,72.00,21.53,2017,9
2,3,576.66,578.55,62.730000,49.800000,111.57,64.78,2017,9
3,0,441.02,429.39,37.242208,36.987877,85.94,69.36,2017,9
4,1,29.11,15.91,37.242208,36.987877,24.64,21.21,2017,9
...,...,...,...,...,...,...,...,...,...
4940,2,669.47,602.96,26.880000,23.410000,348.72,351.98,2020,8
4941,4,1116.00,1262.10,42.370000,36.630000,54.67,20.28,2020,8
4942,3,494.66,415.53,61.830000,26.280000,93.49,77.25,2020,8
4943,0,482.86,547.03,37.242208,36.987877,87.22,93.78,2020,8


In [25]:
# List each column's dtype after the conversions above.
data.dtypes

Region                                    int64
Thermal Generation Actual (in MU)       float64
Thermal Generation Estimated (in MU)    float64
Nuclear Generation Actual (in MU)       float64
Nuclear Generation Estimated (in MU)    float64
Hydro Generation Actual (in MU)         float64
Hydro Generation Estimated (in MU)      float64
Year                                      int64
Month                                     int64
dtype: object

**Splitting/Scaling**

In [26]:
# Separate the target (encoded Region) from the feature columns.
# Work on a copy so `data` itself keeps its Region column.
X = data.copy()
y = X.pop('Region')

In [27]:
# Standardise every feature column; X becomes the scaled array.
# NOTE(review): the scaler is fitted on the full feature matrix before the
# train/test split in the next cell, so test-set statistics leak into training.
# Consider splitting first and fitting the scaler on the training rows only.
scaler = StandardScaler()
scaler.fit(X)
X = scaler.transform(X)

In [28]:
# Hold out 30% of the rows for evaluation.
# A fixed random_state makes the split, and so the reported scores, reproducible
# across kernel restarts. stratify=y keeps each region's share the same in the
# training and test partitions.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, train_size=0.7, random_state=42, stratify=y
)

**Modeling**

In [29]:
# Seed shared by every estimator that draws random numbers with its default
# settings, so the scores are repeatable from run to run.
SEED = 42

# Candidate classifiers, otherwise with library-default hyperparameters.
# LogisticRegression and SVC are left unseeded: with their default settings
# they are not expected to use the random state (verify if solver or
# probability options are changed).
models = [
    LogisticRegression(),
    SVC(),
    MLPClassifier(random_state=SEED),
    DecisionTreeClassifier(random_state=SEED),
    AdaBoostClassifier(random_state=SEED),
    BaggingClassifier(random_state=SEED),
    GradientBoostingClassifier(random_state=SEED),
    RandomForestClassifier(random_state=SEED)
]

# Display labels, index-aligned with `models`. Each is left-padded to the same
# width so the printed colons line up.
model_names = [
    "         Logistic Regression",
    "      Support Vector Machine",
    "              Neural Network",
    "               Decision Tree",
    "         AdaBoost Classifier",
    "          Bagging Classifier",
    "Gradient Boosting Classifier",
    "    Random Forest Classifier"
]

In [30]:
# Fit each classifier on the training split and record its score on the test
# split, in the same order as `models`.
results = []

for model in models:
    model.fit(X_train, y_train)
    test_score = model.score(X_test, y_test)
    results.append(test_score)

In [31]:
## Results

In [32]:
# Print one "<label>: <score>" line per model, with the score to five decimals.
for name, test_score in zip(model_names, results):
    print(f"{name}: {test_score:.5f}")

         Logistic Regression: 0.99865
      Support Vector Machine: 1.00000
              Neural Network: 1.00000
               Decision Tree: 0.99933
         AdaBoost Classifier: 0.57951
          Bagging Classifier: 1.00000
Gradient Boosting Classifier: 0.99933
    Random Forest Classifier: 1.00000


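I initially suspected the models were overfitting, but these scores are computed on the held-out test set, so overfitting alone would not produce them. Near-perfect accuracy across almost every model more likely means Region is trivially recoverable from these features (for example, every row whose nuclear values were missing now holds the same imputed constant), so the results should be treated with caution. Leaving the analysis here for now.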