In [1]:
import numpy as np
import scipy #only sparse matrices allowed
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score

In [3]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import SGDClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from lightgbm import LGBMClassifier
from sklearn.neural_network import MLPClassifier
from xgboost.sklearn import XGBClassifier
from numpy import genfromtxt
from sklearn.ensemble import StackingClassifier
from sklearn.model_selection import StratifiedKFold

In [5]:
# Load the training feature matrices and label vectors from ./without_dups/.
# genfromtxt usage: https://stackoverflow.com/questions/3518778/how-do-i-read-csv-data-into-a-record-array-in-numpy

# "ada" variant: features, then labels cast to int right after parsing.
X_train_full_ada_std = genfromtxt('./without_dups/X_train_full_ada_std.csv', delimiter=',')
y_train_full_ada = genfromtxt('./without_dups/y_train_full_ada.csv', delimiter=',').astype(int)

# Non-"ada" variant, loaded the same way.
X_train_full_std = genfromtxt('./without_dups/X_train_full_std.csv', delimiter=',')
y_train_full = genfromtxt('./without_dups/y_train_full.csv', delimiter=',').astype(int)

In [76]:
X_train_full_ada_std.shape

(51470, 21)

In [77]:
y_train_full_ada.shape

(51470,)

In [78]:
X_train_full_std.shape

(22650, 21)

In [79]:
y_train_full.shape

(22650,)

# ADA-based classifiers:

In [80]:
# Base (level-0) estimators for the stacking ensemble that is later fitted on
# X_train_full_ada_std / y_train_full_ada. Each entry is a (name, estimator)
# pair and the whole list is handed to StackingClassifier(estimators=...).
# NOTE(review): "ada" presumably refers to an ADASYN-oversampled training set — confirm.
# NOTE(review): no estimator here sets random_state, so refitting will not
# reproduce the same model — consider seeding the stochastic ones.
estimators_ada = [
    ('knn', KNeighborsClassifier(weights = 'distance', p = 1, n_neighbors = 5, metric = 'minkowski', leaf_size = 45, algorithm = 'auto')),
    ('lgbm', LGBMClassifier(reg_lambda= 0.0, reg_alpha= 0.001, objective= 'multiclass', num_leaves= 64, n_estimators= 200, max_depth = -1, importance_type= 'split', class_weight= 'balanced', boosting_type= 'gbdt')),
    ('logreg', LogisticRegression(solver = 'sag', max_iter = 100000, class_weight = 'balanced', C = 2.0)),
    ('mlp', MLPClassifier(solver = 'adam', max_iter = 400, learning_rate = 'constant', alpha = 0.1, activation = 'logistic')),
    ('rf', RandomForestClassifier(n_estimators = 300, max_features = 'log2', max_depth = None, criterion = 'entropy', class_weight = 'balanced')),
    ('sgd', SGDClassifier(tol = 0.01, penalty = 'l2', max_iter = 10000, class_weight = 'balanced', alpha = 0.01)),
    ('gbc', GradientBoostingClassifier(n_estimators = 200, max_features = 'log2', max_depth = 9, learning_rate = 0.1)),
    ('dt', DecisionTreeClassifier(splitter = 'random', min_samples_split = 2, max_features = None, max_depth = None, criterion = 'entropy', class_weight = 'balanced')),
    # NOTE(review): per the scikit-learn SVC docs `degree` is only used by the
    # 'poly' kernel, so with kernel='rbf' it looks inert — confirm and drop.
    ('svc', SVC(kernel = 'rbf', gamma = 'auto', degree = 10, class_weight = 'balanced', C = 1.5)),
    # The drop/normalize arguments below are DART-booster settings (booster = 'dart').
    # NOTE(review): use_label_encoder may be deprecated in the installed xgboost — confirm.
    ('xgb', XGBClassifier(use_label_encoder = False, skip_drop = 0.2, sample_type = 'uniform', rate_drop = 0.0, one_drop = 0, normalize_type = 'forest', booster = 'dart'))
]

In [81]:
# Meta-learner: a gradient-boosting classifier left at its default settings,
# stacked on top of the base estimators collected in `estimators_ada`.
final_estimator_ada = GradientBoostingClassifier()
final_clf_ada = StackingClassifier(estimators=estimators_ada, final_estimator=final_estimator_ada)

In [82]:
final_clf_ada.fit(X_train_full_ada_std, y_train_full_ada)



StackingClassifier(estimators=[('knn',
                                KNeighborsClassifier(leaf_size=45, p=1,
                                                     weights='distance')),
                               ('lgbm',
                                LGBMClassifier(class_weight='balanced',
                                               n_estimators=200, num_leaves=64,
                                               objective='multiclass',
                                               reg_alpha=0.001)),
                               ('logreg',
                                LogisticRegression(C=2.0,
                                                   class_weight='balanced',
                                                   max_iter=100000,
                                                   solver='sag')),
                               ('mlp',
                                MLPClassifier(activation='logistic', a...
                                              missing=nan,
  

In [63]:
#final_clf_ada.score(X_test_ada_std, y_test_ada)

0.9783732317736671

# Non-ADA classifiers:

In [83]:
# Base (level-0) estimators for the stacking ensemble that is later fitted on
# X_train_full_std / y_train_full. Each entry is a (name, estimator) pair and
# the whole list is handed to StackingClassifier(estimators=...).
# Unlike `estimators_ada`, this list has no XGBoost member and most models use
# class_weight = None (only the decision tree is 'balanced').
# NOTE(review): no estimator here sets random_state, so refitting will not
# reproduce the same model — consider seeding the stochastic ones.
estimators_non_ada = [
    ('knn', KNeighborsClassifier(weights = 'distance', p = 1, n_neighbors = 5, metric = 'minkowski', leaf_size = 30, algorithm = 'auto')),
    ('lgbm', LGBMClassifier(reg_lambda= 1.0, reg_alpha= 0.001, num_leaves= 64, n_estimators= 200, max_depth = 8, importance_type= 'gain', class_weight= None, boosting_type= 'gbdt')),
    ('logreg', LogisticRegression(solver = 'newton-cg', max_iter = 1000, class_weight = None, C = 2.0)),
    ('mlp', MLPClassifier(solver = 'adam', max_iter = 400, learning_rate = 'constant', alpha = 0.1, activation = 'relu')),
    ('rf', RandomForestClassifier(n_estimators = 300, max_features = 'sqrt', max_depth = None, criterion = 'entropy', class_weight = None)),
    ('sgd', SGDClassifier(tol = 0.01, penalty = 'l1', max_iter = 500, class_weight = None, alpha = 0.001)),
    ('gbc', GradientBoostingClassifier(n_estimators = 200, max_features = 'log2', max_depth = 9, learning_rate = 0.1)),
    ('dt', DecisionTreeClassifier(splitter = 'best', min_samples_split = 2, max_features = None, max_depth = None, criterion = 'entropy', class_weight = 'balanced')),
    # NOTE(review): per the scikit-learn SVC docs `degree` is only used by the
    # 'poly' kernel, so with kernel='rbf' it looks inert — confirm and drop.
    ('svc', SVC(kernel = 'rbf', gamma = 'scale', degree = 5, class_weight = None , C = 2.0)),
]

In [84]:
# Meta-learner: a gradient-boosting classifier left at its default settings,
# stacked on top of the base estimators collected in `estimators_non_ada`.
final_estimator_non_ada = GradientBoostingClassifier()
final_clf_non_ada = StackingClassifier(estimators=estimators_non_ada, final_estimator=final_estimator_non_ada)

In [85]:
final_clf_non_ada.fit(X_train_full_std, y_train_full)

KeyboardInterrupt: 

In [None]:
#final_clf_non_ada.score(X_test_std, y_test)

In [86]:
def df_year_handler(df_train):
    """Replace the ``time`` column with integer ``year``, ``month`` and ``date`` columns.

    The string form of each ``time`` value is read as fixed-width ``YYYYMMDD``
    (for example ``20130927`` -> year 2013, month 9, date 27).

    Parameters
    ----------
    df_train : pandas.DataFrame
        Frame loaded from the CSV file. It must contain a ``time`` column.
        Labels are not handled here, so keep them in a separate column.

    Returns
    -------
    pandas.DataFrame
        A new frame holding every original column except ``time``, followed by
        ``year``, ``month`` and ``date`` (in that order) as integers. The frame
        passed in is left untouched.

    Raises
    ------
    KeyError
        If ``df_train`` has no ``time`` column.
    ValueError
        If a sliced piece of a ``time`` value cannot be parsed as an integer.
    """
    # astype returns a fresh Series, so nothing is written back into the
    # caller's frame (the previous version overwrote its "time" column).
    time_str = df_train["time"].astype(str)

    # Fixed-width slices pick out the same characters that the old
    # split-on-empty-pattern approach selected, without depending on how an
    # empty separator happens to be interpreted.
    return df_train.drop(columns=["time"]).assign(
        year=time_str.str[0:4].astype(int),
        month=time_str.str[4:6].astype(int),
        date=time_str.str[6:8].astype(int),
    )

In [87]:
submit_df = pd.read_csv(r"./ift3395-6390-weatherevents/test.csv")

In [88]:
submit_df = df_year_handler(submit_df)

In [89]:
# #https://datascience.stackexchange.com/questions/13567/ways-to-deal-with-longitude-latitude-feature
# #https://heartbeat.comet.ml/working-with-geospatial-data-in-machine-learning-ad4097c7228d
# #https://stackoverflow.com/questions/14059094/i-want-to-multiply-two-columns-in-a-pandas-dataframe-and-add-the-result-into-a-n
# submit_df['cos_lat'] = np.cos(submit_df[['lat']])
# submit_df['cos_long'] = np.cos(submit_df[['lon']])
# submit_df['sin_lat'] = np.sin(submit_df[['lat']])
# submit_df['sin_long'] = np.sin(submit_df[['lon']])

# submit_df['x'] = submit_df['cos_lat'] * submit_df['cos_long']
# submit_df['y'] = submit_df['cos_lat'] * submit_df['sin_long']
# submit_df['z'] = submit_df['sin_lat']

# #submit_df = submit_df.drop(columns=['cos_lat','cos_long','sin_lat','sin_long', 'lat', 'lon'])
# submit_df = submit_df.drop(columns=['cos_lat','cos_long','sin_lat','sin_long'])


# # submit_df["x"] = np.cos(submit_df[['lat']]) * np.cos(submit_df[['lon']])
# # submit_df["y"] = np.cos(submit_df[['lat']]) * np.sin(submit_df[['lon']])
# # submit_df["z"] = np.sin(submit_df[['lat']])

In [90]:
submit_df = submit_df.drop(columns=["S.No"])

In [91]:
submit_df

Unnamed: 0,lat,lon,TMQ,U850,V850,UBOT,VBOT,QREFHT,PS,PSL,...,T500,PRECT,TS,TREFHT,Z1000,Z200,ZBOT,year,month,date
0,-31.095176,353.1250,22.372244,-1.994936,-0.002513,-7.732825,-1.864900,0.008203,102809.6719,102809.6719,...,258.135406,4.060000e-08,290.377930,289.014862,234.634003,12023.84082,63.400764,2013,9,27
1,-31.095176,353.4375,21.536032,-2.058918,0.397359,-8.436489,-0.940275,0.007887,102816.6016,102816.6016,...,258.205994,1.670000e-08,290.377228,288.873138,234.982056,12023.83301,63.348942,2013,9,27
2,-31.095176,353.7500,20.465317,-2.128485,0.717785,-8.842417,0.152449,0.007695,102827.8594,102827.8594,...,258.242279,1.220000e-08,290.377228,288.641480,235.631378,12025.41113,63.281322,2013,9,27
3,-31.095176,354.0625,19.967148,-2.126235,1.123356,-8.773702,1.251795,0.007519,102830.5391,102830.5391,...,258.335113,3.460000e-10,290.441406,288.420074,235.608460,12026.61426,63.216827,2013,9,27
4,-31.095176,354.3750,19.598663,-2.053118,1.823581,-8.472447,2.225254,0.007293,102823.8516,102823.8516,...,258.513702,5.920000e-10,290.468903,288.392975,235.004334,12027.29004,63.200485,2013,9,27
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7315,24.054759,276.5625,51.415295,-1.095974,-8.194263,2.484773,-10.520496,0.020928,100743.8438,100743.8438,...,268.880676,6.870000e-15,302.576569,302.226227,66.832077,12467.79492,66.832077,2013,9,16
7316,24.054759,276.8750,52.377407,-0.265653,-8.730537,3.783044,-10.748092,0.021389,100703.5313,100703.5313,...,269.085083,9.240000e-10,302.583557,302.131012,66.827492,12466.41309,66.827492,2013,9,16
7317,24.054759,277.1875,54.639217,0.775797,-9.646189,5.087689,-10.786784,0.021745,100673.9844,100673.9844,...,269.305939,5.870000e-08,302.646820,302.032715,66.812981,12467.56934,66.812981,2013,9,16
7318,24.054759,277.5000,56.121231,1.813888,-10.849813,6.442380,-10.859090,0.021840,100647.3359,100647.3359,...,269.616638,8.810000e-08,302.646820,302.014435,66.808281,12468.06934,66.808273,2013,9,16


In [92]:
# #Trying clustering:
# from sklearn.cluster import KMeans ,AgglomerativeClustering

# # creates 5 clusters using hierarchical clustering.
# agc = AgglomerativeClustering(n_clusters = 5, affinity='euclidean', linkage='ward')
# submit_df['cluster_lat'] = agc.fit_predict(submit_df[['lat']])

# agc = AgglomerativeClustering(n_clusters = 7, affinity='euclidean', linkage='ward')
# submit_df['cluster_lon'] = agc.fit_predict(submit_df[['lon']])

In [93]:
submit_df

Unnamed: 0,lat,lon,TMQ,U850,V850,UBOT,VBOT,QREFHT,PS,PSL,...,T500,PRECT,TS,TREFHT,Z1000,Z200,ZBOT,year,month,date
0,-31.095176,353.1250,22.372244,-1.994936,-0.002513,-7.732825,-1.864900,0.008203,102809.6719,102809.6719,...,258.135406,4.060000e-08,290.377930,289.014862,234.634003,12023.84082,63.400764,2013,9,27
1,-31.095176,353.4375,21.536032,-2.058918,0.397359,-8.436489,-0.940275,0.007887,102816.6016,102816.6016,...,258.205994,1.670000e-08,290.377228,288.873138,234.982056,12023.83301,63.348942,2013,9,27
2,-31.095176,353.7500,20.465317,-2.128485,0.717785,-8.842417,0.152449,0.007695,102827.8594,102827.8594,...,258.242279,1.220000e-08,290.377228,288.641480,235.631378,12025.41113,63.281322,2013,9,27
3,-31.095176,354.0625,19.967148,-2.126235,1.123356,-8.773702,1.251795,0.007519,102830.5391,102830.5391,...,258.335113,3.460000e-10,290.441406,288.420074,235.608460,12026.61426,63.216827,2013,9,27
4,-31.095176,354.3750,19.598663,-2.053118,1.823581,-8.472447,2.225254,0.007293,102823.8516,102823.8516,...,258.513702,5.920000e-10,290.468903,288.392975,235.004334,12027.29004,63.200485,2013,9,27
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7315,24.054759,276.5625,51.415295,-1.095974,-8.194263,2.484773,-10.520496,0.020928,100743.8438,100743.8438,...,268.880676,6.870000e-15,302.576569,302.226227,66.832077,12467.79492,66.832077,2013,9,16
7316,24.054759,276.8750,52.377407,-0.265653,-8.730537,3.783044,-10.748092,0.021389,100703.5313,100703.5313,...,269.085083,9.240000e-10,302.583557,302.131012,66.827492,12466.41309,66.827492,2013,9,16
7317,24.054759,277.1875,54.639217,0.775797,-9.646189,5.087689,-10.786784,0.021745,100673.9844,100673.9844,...,269.305939,5.870000e-08,302.646820,302.032715,66.812981,12467.56934,66.812981,2013,9,16
7318,24.054759,277.5000,56.121231,1.813888,-10.849813,6.442380,-10.859090,0.021840,100647.3359,100647.3359,...,269.616638,8.810000e-08,302.646820,302.014435,66.808281,12468.06934,66.808273,2013,9,16


In [97]:
def standardizer_new(X_train, mean=None, std=None):
    """Z-score standardise every column of ``X_train`` and return a NumPy array.

    Parameters
    ----------
    X_train : array-like or pandas.DataFrame
        Two-dimensional numeric data; each column is scaled independently.
    mean : 1-D array-like, optional
        Per-column centre, matched to the columns by position. When omitted,
        the column means of ``X_train`` itself are used (the original behaviour).
    std : 1-D array-like, optional
        Per-column scale, matched to the columns by position. When omitted,
        the sample standard deviation (pandas default, ``ddof=1``) of
        ``X_train`` itself is used (the original behaviour).

    Returns
    -------
    numpy.ndarray
        ``(X_train - mean) / std`` with the same shape as the input.

    Notes
    -----
    When transforming held-out data, pass the ``mean`` and ``std`` computed on
    the training set so the features land on the scale the model was fitted on.
    A column whose scale is exactly 0 is divided by 1 instead, so a constant
    column becomes all zeros rather than NaN.
    """
    df = pd.DataFrame(X_train)

    # Fall back to the data's own statistics unless the caller supplies them.
    if mean is None:
        center = df.mean()
    else:
        center = pd.Series(np.asarray(mean, dtype=float), index=df.columns)
    if std is None:
        scale = df.std()
    else:
        scale = pd.Series(np.asarray(std, dtype=float), index=df.columns)

    # Guard against 0/0 on zero-variance columns; NaN scales are left as-is.
    scale = scale.mask(scale == 0, 1.0)

    return ((df - center) / scale).to_numpy()

In [98]:
X_submit = standardizer_new(submit_df)

In [99]:
X_submit.shape

(7320, 21)

In [100]:
predictions_submit = final_clf_ada.predict(X_submit)

In [101]:
# Build the submission table: a single integer "LABELS" column whose row index
# is exported under the header "S.No".
# Index naming: https://stackoverflow.com/questions/18022845/pandas-index-column-title-or-name
predictions_df = (
    pd.DataFrame(data=predictions_submit, columns=["LABELS"])
    .astype(int)
    .rename_axis("S.No")
)
predictions_df.to_csv(r"./ift3395-6390-weatherevents/submission_ada.csv")