In [18]:
import pandas as pd
import numpy as np
import seaborn as sb
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score
from sklearn.metrics import confusion_matrix,f1_score,accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
import pickle
import xgboost 
from sklearn.svm import SVC

In [17]:
# Load the raw weather observations; the CSV is comma-separated, which is the pandas default.
df = pd.read_csv("C:/Users/ariim/Desktop/RainNextDay/weatherAUS.csv")

In [19]:
# Columns missing in at least this percentage of rows are dropped instead of imputed.
MAX_MISSING_PCT = 20

# Rows without the target ("RainTomorrow") or without "RainToday" are discarded.
df = df.dropna(subset=["RainTomorrow", "RainToday"])

# Share of missing values per column, in percent (rows no longer change below,
# so computing it once is equivalent to recomputing it per column).
missing_pct = df.isna().mean() * 100

for var in df.columns.tolist():
    if missing_pct[var] < MAX_MISSING_PCT:
        # Numeric columns are imputed with the median, everything else with the
        # most frequent value. Checking for "numeric" rather than for the
        # "object" dtype keeps string columns away from median() even when
        # pandas stores them with a dedicated string dtype.
        if pd.api.types.is_numeric_dtype(df[var]):
            fill_value = df[var].median()
        else:
            fill_value = df[var].mode().iloc[0]
        # Assign the result back: `df[var].fillna(..., inplace=True)` acts on a
        # temporary object and does not update `df` under pandas Copy-on-Write.
        df[var] = df[var].fillna(fill_value)
    else:
        df = df.drop(columns=[var])

# Report the final shape once, after every column has been processed.
print(df.shape)

(140787, 19)


In [20]:
# Replace the raw date by a categorical month feature (French month names).
month = {
    1: "Janvier", 2: "Fevrier", 3: "Mars", 4: "Avril",
    5: "Mai", 6: "Juin", 7: "Juillet", 8: "Aout",
    9: "Septembre", 10: "Octobre", 11: "Novembre", 12: "Decembre",
}
observation_dates = pd.to_datetime(df["Date"], format="%Y-%m-%d")
df["Month"] = observation_dates.dt.month.map(month)
# The date itself is not used as a feature.
df = df.drop(columns=["Date"])

In [21]:
# Outlier removal with the 1.5 * IQR rule, applied to every numeric column.
# `numeric_only=True` is required: the frame still holds string columns, and
# recent pandas versions raise when asked for quantiles of non-numeric data.
Q1 = df.quantile(0.25, numeric_only=True)
Q3 = df.quantile(0.75, numeric_only=True)
IQR = Q3 - Q1

lower_fence = Q1 - 1.5 * IQR
upper_fence = Q3 + 1.5 * IQR

# A row is kept only if it lies strictly inside the fences for *all* numeric
# columns. The fences are computed once on the unfiltered data, so one combined
# mask is equivalent to filtering column after column.
# NOTE(review): the rule is applied to every numeric column without exception;
# confirm that discarding extreme observations of each variable is intended.
numeric_part = df[IQR.index]
inside_fences = ((numeric_part > lower_fence) & (numeric_part < upper_fence)).all(axis=1)
df = df[inside_fences]

In [192]:
# from sklearn.svm import LinearSVC
# from sklearn.feature_selection import SelectFromModel
# lsvc = LinearSVC(C=0.01, penalty="l1", dual=False).fit(X, y)
# model = SelectFromModel(lsvc, prefit=True)
# X = model.transform(X)

In [22]:
# Base estimators handed to the grid searches.
lr = LogisticRegression()
rf = RandomForestClassifier()
svc = SVC()  # only referenced by the disabled "support vector" entry below
xg = xgboost.XGBClassifier()

# Model search space: one entry per candidate with
#   "name"  -> label used for printing and as key of the result dictionaries,
#   "model" -> estimator instance,
#   "param" -> parameter grid (a dict, or a list of dicts) for GridSearchCV.
md = [
    {
        # NOTE: the label reads "linear regression" although the estimator is a
        # LogisticRegression; it is kept unchanged because it is used as a key.
        "name": "linear regression",
        "model": lr,
        # Two separate grids so that only solver/penalty pairs accepted by
        # scikit-learn are tried: the "l1" penalty is supported by "liblinear"
        # and "saga" only. A single crossed grid also generates
        # l1 + newton-cg / lbfgs / sag, whose fits fail and are wasted.
        "param": [
            {"penalty": ["l1"], "solver": ["liblinear", "saga"]},
            {"penalty": ["l2"], "solver": ["newton-cg", "lbfgs", "liblinear", "sag", "saga"]},
        ],
    },
    {
        "name": "random forest",
        "model": rf,
        "param": {
            "n_estimators": [100, 150, 200],
            "criterion": ["gini", "entropy"],
            "max_features": ["sqrt", "log2"],
            # NOTE(review): warm_start presumably has no effect on a freshly
            # cloned estimator and doubles the grid — confirm it is needed.
            "warm_start": [False, True],
        },
    },
#        {
#          "name": "support vector","model":svc , "param":{
#                                  "C":[1.0,0.5,2,0.3],
#                                  "kernel":["linear","poly", "rbf", "sigmoid", "precomputed"],
#                                  "degree":[3,4],
#                                  "decision_function_shape":["ovo","ovr"]
#                              }
#        },
    {
        "name": "xgboost",
        "model": xg,
        "param": {
            'objective': ['binary:logistic'],
            'learning_rate': [0.05, 0.01, 0.03],
            'n_estimators': [100, 300, 500],
            'missing': [-999],
        },
    },
]

In [None]:
def col(df,location,y):
    """Return the one-hot encoded feature names available for one location.

    Parameters
    ----------
    df : pandas.DataFrame
        Full data set; must contain a "Location" column and the column `y`.
    location : value compared against the "Location" column to select rows.
    y : str
        Name of the target column, excluded from the features.

    Returns
    -------
    list
        Column names produced by `pd.get_dummies` on the rows of `location`,
        after removing `y` and "Location". Only categories present at that
        location yield a dummy column.
    """
    feature_names = list(df.columns)
    feature_names.remove(y)
    at_location = df["Location"] == location
    location_features = df.loc[at_location, feature_names].drop(columns="Location")
    encoded = pd.get_dummies(location_features)
    return list(encoded.columns)
# Encoded feature names per location, keyed by location name.
list_col = {}
for loc in df["Location"].unique():
    print(loc)
    list_col[loc] = col(df, loc, "RainTomorrow")

In [37]:
# Persist the per-location feature names to the file "list_col".
with open("list_col", mode="wb") as file:
    file.write(pickle.dumps(list_col))

In [74]:
def modelise_gal(df,y,map_y,model_list):
    """Grid-search every candidate model on the whole data set (all locations).

    Parameters
    ----------
    df : pandas.DataFrame
        Prepared data; every column except `y` is used as a feature
        (categorical columns are one-hot encoded).
    y : str
        Name of the target column.
    map_y : dict
        Mapping from the raw target values to the encoded classes.
    model_list : list of dict
        Candidates, each with the keys "name", "model" and "param"
        (parameter grid passed to GridSearchCV).

    Returns
    -------
    dict
        Maps each candidate's "name" to a dict with the keys "estimator"
        (refitted best estimator), "best_param", "score" (best mean
        cross-validated F1), "scaler" (fitted StandardScaler), "X_test"
        (scaled hold-out features) and "y_test".
    """
    target = df[y].map(map_y)
    features = pd.get_dummies(df.drop(columns=[y]))

    X_train, X_test, y_train, y_test = train_test_split(
        features, target, test_size=0.30, stratify=target, random_state=42
    )

    # Fit the scaler on the training part only, so that no statistic of the
    # hold-out set leaks into the features the models are trained on.
    sc = StandardScaler()
    X_train = sc.fit_transform(X_train)
    X_test = sc.transform(X_test)

    # Oversample the training set only; seeded so that runs are reproducible.
    # NOTE(review): resampling happens before cross-validation, so validation
    # folds contain synthetic samples and "score" is likely optimistic.
    smote = SMOTE(random_state=42)
    X_train, y_train = smote.fit_resample(X_train, y_train)

    ret = {}
    # Iterate over the candidates that were passed in, not over a global list.
    for candidate in model_list:
        print("     ", candidate["name"])
        clf = GridSearchCV(candidate["model"], candidate["param"], cv=5,
                           scoring=["f1"], refit='f1', n_jobs=-1)
        clf.fit(X_train, y_train)
        ret[candidate["name"]] = {
            "estimator": clf.best_estimator_,
            "best_param": clf.best_params_,
            "score": clf.best_score_,
            "scaler": sc,
            "X_test": X_test,
            "y_test": y_test,
        }
    return ret

In [12]:
def modelise(df,location,y,map_y,model_list):
    """Grid-search every candidate model on the rows of a single location.

    Parameters
    ----------
    df : pandas.DataFrame
        Prepared data; must contain a "Location" column and the column `y`.
    location : value compared against the "Location" column to select rows.
    y : str
        Name of the target column.
    map_y : dict
        Mapping from the raw target values to the encoded classes.
    model_list : list of dict
        Candidates, each with the keys "name", "model" and "param"
        (parameter grid passed to GridSearchCV).

    Returns
    -------
    dict
        Maps each candidate's "name" to a dict with the keys "estimator"
        (refitted best estimator), "best_param", "score" (best mean
        cross-validated F1), "scaler" (fitted StandardScaler), "X_test"
        (scaled hold-out features) and "y_test".
    """
    rows = df[df["Location"] == location]
    target = rows[y].map(map_y)
    # "Location" is constant within the selection, so it is not a feature.
    features = pd.get_dummies(rows.drop(columns=[y, "Location"]))

    X_train, X_test, y_train, y_test = train_test_split(
        features, target, test_size=0.30, stratify=target, random_state=42
    )

    # Fit the scaler on the training part only, so that no statistic of the
    # hold-out set leaks into the features the models are trained on.
    sc = StandardScaler()
    X_train = sc.fit_transform(X_train)
    X_test = sc.transform(X_test)

    # Oversample the training set only; seeded so that runs are reproducible.
    # NOTE(review): resampling happens before cross-validation, so validation
    # folds contain synthetic samples and "score" is likely optimistic.
    smote = SMOTE(random_state=42)
    X_train, y_train = smote.fit_resample(X_train, y_train)

    ret = {}
    # Iterate over the candidates that were passed in, not over a global list.
    for candidate in model_list:
        print("     ", candidate["name"])
        clf = GridSearchCV(candidate["model"], candidate["param"], cv=5,
                           scoring=["f1"], refit='f1', n_jobs=-1)
        clf.fit(X_train, y_train)
        ret[candidate["name"]] = {
            "estimator": clf.best_estimator_,
            "best_param": clf.best_params_,
            "score": clf.best_score_,
            "scaler": sc,
            "X_test": X_test,
            "y_test": y_test,
        }
    return ret

In [13]:
# Train and tune every candidate model separately for each location.
resultat_models_par_loc = {}
for loc in df["Location"].unique():
    print(loc)
    resultat_models_par_loc[loc] = modelise(
        df, loc, "RainTomorrow", {"Yes": 1, "No": 0}, md
    )

Albury
      linear regression




      random forest
      xgboost
BadgerysCreek
      linear regression
      random forest
      xgboost
Cobar
      linear regression




      random forest
      xgboost
CoffsHarbour
      linear regression
      random forest
      xgboost
Moree
      linear regression
      random forest
      xgboost
Newcastle
      linear regression




      random forest
      xgboost
NorahHead
      linear regression
      random forest
      xgboost
NorfolkIsland
      linear regression
      random forest
      xgboost
Penrith
      linear regression
      random forest
      xgboost
Richmond
      linear regression
      random forest
      xgboost
Sydney
      linear regression
      random forest
      xgboost
SydneyAirport
      linear regression
      random forest
      xgboost
WaggaWagga
      linear regression
      random forest
      xgboost
Williamtown
      linear regression
      random forest
      xgboost
Wollongong
      linear regression
      random forest
      xgboost
Canberra
      linear regression
      random forest
      xgboost
Tuggeranong
      linear regression
      random forest
      xgboost
MountGinini
      linear regression
      random forest
      xgboost
Ballarat
      linear regression




      random forest
      xgboost
Bendigo
      linear regression
      random forest
      xgboost
Sale
      linear regression
      random forest
      xgboost
MelbourneAirport
      linear regression




      random forest
      xgboost
Melbourne
      linear regression
      random forest
      xgboost
Mildura
      linear regression
      random forest
      xgboost
Nhil
      linear regression
      random forest
      xgboost
Portland
      linear regression
      random forest
      xgboost
Watsonia
      linear regression
      random forest
      xgboost
Dartmoor
      linear regression




      random forest
      xgboost
Brisbane
      linear regression
      random forest
      xgboost
Cairns
      linear regression




      random forest
      xgboost
GoldCoast
      linear regression
      random forest
      xgboost
Townsville
      linear regression
      random forest
      xgboost
Adelaide
      linear regression
      random forest
      xgboost
MountGambier
      linear regression
      random forest
      xgboost
Nuriootpa
      linear regression
      random forest
      xgboost
Woomera
      linear regression




      random forest
      xgboost
Albany
      linear regression
      random forest
      xgboost
Witchcliffe
      linear regression
      random forest
      xgboost
PearceRAAF
      linear regression




      random forest
      xgboost
PerthAirport
      linear regression
      random forest
      xgboost
Perth
      linear regression
      random forest
      xgboost
SalmonGums
      linear regression
      random forest
      xgboost
Walpole
      linear regression
      random forest
      xgboost
Hobart
      linear regression
      random forest
      xgboost
Launceston
      linear regression




      random forest
      xgboost
AliceSprings
      linear regression




      random forest
      xgboost
Darwin
      linear regression




      random forest
      xgboost
Katherine
      linear regression
      random forest
      xgboost
Uluru
      linear regression
      random forest
      xgboost


In [14]:
# Persist the per-location search results to the file 'per_location_models'.
with open('per_location_models', mode='wb') as file:
    file.write(pickle.dumps(resultat_models_par_loc))

In [15]:
# Train and tune every candidate model on the complete data set (all locations).
resultat_models_gal = modelise_gal(
    df,
    y="RainTomorrow",
    map_y={"Yes": 1, "No": 0},
    model_list=md,
)

      linear regression
      random forest
      xgboost


KeyboardInterrupt: 

In [None]:
# Persist the all-locations search results to the file 'General_models'.
with open('General_models', mode='wb') as file:
    file.write(pickle.dumps(resultat_models_gal))