In [1]:
import findspark
import pyspark
import datetime, warnings, scipy 
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib as mpl
import matplotlib.pyplot as plt
import matplotlib.patches as patches
from matplotlib.patches import ConnectionPatch
from collections import OrderedDict
from matplotlib.gridspec import GridSpec
from sklearn import metrics, linear_model
from sklearn.preprocessing import PolynomialFeatures, StandardScaler
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.model_selection import train_test_split, cross_val_score, cross_val_predict
from scipy.optimize import curve_fit
plt.rcParams["patch.force_edgecolor"] = True
plt.style.use('fivethirtyeight')
mpl.rc('patch', edgecolor = 'dimgray', linewidth=1)
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "last_expr"
pd.options.display.max_columns = 50
%matplotlib inline
warnings.filterwarnings("ignore")

In [2]:
import os
os.getcwd()

'/home/jovyan'

In [3]:
from pyspark.sql import SparkSession
from pyspark import SparkContext

In [4]:
spark = SparkSession \
    .builder \
    .appName("Pysparkexample") \
    .config("spark.some.config.option", "some-value") \
    .getOrCreate()

In [5]:
import os
os.environ['PYSPARK_SUBMIT_ARGS'] = '--packages com.databricks:spark-xml_2.11:0.4.1 pyspark-shell'

In [6]:
df_load = spark.read.csv('hdfs://192.168.122.206:8020/data/vol/depart/*.csv',header='true')

In [7]:
df=df_load.orderBy('last_update_fme', ascending = False).dropDuplicates(['aircraft_aircrafttype_iatacode','timestamps_eobt','airports_next_name'])

In [8]:
df_load_meteo = spark.read.csv('hdfs://192.168.122.206:8020/data/meteo/*.csv')

In [9]:
df_load_meteo=df_load_meteo.withColumnRenamed("_c0","index").withColumnRenamed("_c1","numero").withColumnRenamed("_c2","pression").withColumnRenamed("_c3","direction_de_vent").withColumnRenamed("_c4","vitesse_de_vent").withColumnRenamed("_c5","temperature").withColumnRenamed("_c6","humidité").withColumnRenamed("_c7","visibilité").withColumnRenamed("_c8","nebulosite").withColumnRenamed("_c9","hauteur_neige").withColumnRenamed("_c10","precipitations_1").withColumnRenamed("_c11","precipitations_3").withColumnRenamed("_c12","nom").withColumnRenamed("_c13","longitude").withColumnRenamed("_c14","latitude").withColumnRenamed("_c15","timestamp") 

In [10]:
from pyspark.sql.functions import from_unixtime,from_utc_timestamp
func =  lambda x:from_unixtime(x, format='yyyy-MM-dd HH:mm:ss')
df_load_meteo=df_load_meteo.withColumn('timestamp',func(df_load_meteo['timestamp']))


In [11]:
df_load_meteo=df_load_meteo.dropDuplicates(['nom','timestamp'])

In [12]:
import pyspark.sql.functions as F
df_meteo=df_load_meteo.withColumn("timestamp_30_before", df_load_meteo.timestamp - F.expr('INTERVAL 30 MINUTES') )
df_meteo=df_meteo.withColumn("timestamp_30_after", df_load_meteo.timestamp + F.expr('INTERVAL 30 MINUTES') )

In [13]:
df_Lyon_meteo=df_meteo.filter("nom = 'Arrondissement de Lyon'")

In [14]:
df_Lyon_meteo.printSchema()


root
 |-- index: string (nullable = true)
 |-- numero: string (nullable = true)
 |-- pression: string (nullable = true)
 |-- direction_de_vent: string (nullable = true)
 |-- vitesse_de_vent: string (nullable = true)
 |-- temperature: string (nullable = true)
 |-- humidité: string (nullable = true)
 |-- visibilité: string (nullable = true)
 |-- nebulosite: string (nullable = true)
 |-- hauteur_neige: string (nullable = true)
 |-- precipitations_1: string (nullable = true)
 |-- precipitations_3: string (nullable = true)
 |-- nom: string (nullable = true)
 |-- longitude: string (nullable = true)
 |-- latitude: string (nullable = true)
 |-- timestamp: string (nullable = true)
 |-- timestamp_30_before: string (nullable = true)
 |-- timestamp_30_after: string (nullable = true)



In [15]:
joined_df = df.join(df_Lyon_meteo, [df.timestamps_eobt < df_Lyon_meteo.timestamp_30_after,df_Lyon_meteo.timestamp_30_before < df.timestamps_eobt], how='inner')

In [16]:
joined_df=joined_df.dropDuplicates(['aircraft_aircrafttype_iatacode','timestamps_eobt','airports_next_name'])

In [17]:
joined_df=joined_df.orderBy('timestamps_eobt').select(['aircraft_aircrafttype_icaocode','flightnumbers_icaoflightnumber', 'airportresources_terminal','airports_destination_name','airlines_airline_name','airlines_operator_icaocode','airlines_airline_icaocode','countrytype_code','timestamps_sobt','timestamps_eobt','pression', 'direction_de_vent',
       'vitesse_de_vent', 'temperature', 'humidité', 'visibilité',
       'nebulosite', 'hauteur_neige'])

In [18]:
def changethedate(s):
    tab=s.split("-")
    if len(tab[1])==1:
        tab[1]='0'+tab[1]
    s=tab[0]+'-'+tab[1]+'-'+tab[2]
    return s
import json
import requests
import time
import pandas as pd
url="https://pomber.github.io/covid19/timeseries.json"
            #print(url)
response = requests.get(url)
data_json=response.json()

dd=[]
for d in data_json:
        for l in data_json[d]:
                l['pays']=d
                dd.append(l)

covid_df= spark.createDataFrame(dd)
from pyspark.sql.functions import col, udf
from pyspark.sql.types import StringType
func =  udf (lambda x: changethedate(x),StringType())
covid_df=covid_df.withColumn('new_date',func(covid_df['date']))
from pyspark.sql.types import DateType
func =  udf (lambda x: datetime.datetime.strptime(x,'%Y-%m-%d'),DateType())
covid_df=covid_df.withColumn('date_obs',func(covid_df['new_date']))

#create data frame
covid_df=covid_df.select('pays','date_obs','confirmed')

In [19]:
covid_df=covid_df.filter(F.col('pays')=='France')

In [20]:
from pyspark.sql.functions import from_unixtime,from_utc_timestamp
func =  lambda x:pyspark.sql.functions.split(x, ' ')
joined_df=joined_df.withColumn('datedepart',func(joined_df['timestamps_sobt']).getItem(0))

In [21]:
df_Glob = joined_df.join(covid_df, [joined_df.datedepart==covid_df.date_obs], how='left')

In [22]:
from pyspark.sql.functions import coalesce
df_Glob=df_Glob.withColumn('date_obs', coalesce('date_obs', 'datedepart'))
df_Glob=df_Glob.na.fill({'pays': 'France', 'confirmed': 0})

In [23]:
from pyspark.sql.functions import isnan, when, count, col

df_Glob.select([count(when(isnan(c), c)).alias(c) for c in df_Glob.columns]).show()

+------------------------------+------------------------------+-------------------------+-------------------------+---------------------+--------------------------+-------------------------+----------------+---------------+---------------+--------+-----------------+---------------+-----------+--------+----------+----------+-------------+----------+----+--------+---------+
|aircraft_aircrafttype_icaocode|flightnumbers_icaoflightnumber|airportresources_terminal|airports_destination_name|airlines_airline_name|airlines_operator_icaocode|airlines_airline_icaocode|countrytype_code|timestamps_sobt|timestamps_eobt|pression|direction_de_vent|vitesse_de_vent|temperature|humidité|visibilité|nebulosite|hauteur_neige|datedepart|pays|date_obs|confirmed|
+------------------------------+------------------------------+-------------------------+-------------------------+---------------------+--------------------------+-------------------------+----------------+---------------+---------------+--------+--

In [24]:
def to_part_of_day(x):
    if x < 12:
        return "morning"
    elif x < 18:
        return "afternoon"
    else:
        return "evening"
def change(row):
    row['part_of_day']=to_part_of_day(row['timestamps_sobt'].hour)
    if((row['timestamps_eobt']-row['timestamps_sobt']).seconds/60 > 0):
        row['delay']=int((row['timestamps_eobt']-row['timestamps_sobt']).seconds/60)
    else :
        row['delay']=0
    if(int(row['delay'])<=15):
        row['delayLabel']='[0-15]'
    elif(15<int(row['delay'])<=30) :
        row['delayLabel']='[15-30]'
    elif(30<int(row['delay'])<=45) :
        row['delayLabel']='[30-45]'
    elif(45<int(row['delay'])<=60) :
        row['delayLabel']='[45-60]'
    elif(60<int(row['delay'])<=75) :
        row['delayLabel']='[60-75]'
    elif(75<int(row['delay'])<=90) :
        row['delayLabel']='[75-90]'
    elif(105<int(row['delay'])<=120) :
        row['delayLabel']='[105-120]'
    elif(120<int(row['delay'])<=135) :
        row['delayLabel']='[120-135]'
    elif(90<int(row['delay'])<=105) :
        row['delayLabel']='[90-105]'
    elif(150<int(row['delay'])<=165) :
        row['delayLabel']='[150-165]'
    elif(165<int(row['delay'])<=180) :
        row['delayLabel']='[165-180]'
    elif(180<int(row['delay'])<=195) :
        row['delayLabel']='[180-195]'
    elif(210<int(row['delay'])<=225) :
        row['delayLabel']='[210-225]'
    elif(225<int(row['delay'])<=240) :
        row['delayLabel']='[225-240]'
    elif(240<int(row['delay'])<=255) :
        row['delayLabel']='[240-255]'
    elif(255<int(row['delay'])<=270) :
        row['delayLabel']='[255-270]'
    elif(270<int(row['delay'])<=285) :
        row['delayLabel']='[270-285]'
    elif(285<int(row['delay'])<=300) :
        row['delayLabel']='[285-300]'
    elif(195<int(row['delay'])<=210) :
        row['delayLabel']='[195-210]'
    else :
        row['delayLabel']='more then 4 hours'
    
    return row
            

In [25]:
pdf=df_Glob.toPandas()

In [26]:
from datetime import datetime
pdf['timestamps_eobt'] = pdf['timestamps_eobt'].apply(lambda x: datetime.strptime(str(x), "%Y-%m-%d %H:%M:%S"))
pdf['timestamps_sobt'] =pdf['timestamps_sobt'].apply(lambda x: datetime.strptime(str(x), "%Y-%m-%d %H:%M:%S"))
pdf=pdf.apply(lambda x:change(x),axis=1)

In [27]:
pdf=pdf.astype({'pression': 'float64'})
pdf=pdf.astype({'direction_de_vent': 'float64'})
pdf=pdf.astype({'vitesse_de_vent': 'float64'})

pdf=pdf.astype({'temperature': 'float64'})
pdf=pdf.astype({'humidité': 'float64'})
pdf=pdf.astype({'visibilité': 'float64'})
pdf=pdf.astype({'hauteur_neige': 'float64'})
pdf=pdf.astype({'nebulosite': 'float64'})
pdf=pdf.astype({'confirmed': 'float64'})

pdf=pdf.drop(['timestamps_eobt','date_obs','datedepart'], axis=1)

In [28]:
pdf.columns

Index(['aircraft_aircrafttype_icaocode', 'flightnumbers_icaoflightnumber',
       'airportresources_terminal', 'airports_destination_name',
       'airlines_airline_name', 'airlines_operator_icaocode',
       'airlines_airline_icaocode', 'countrytype_code', 'timestamps_sobt',
       'pression', 'direction_de_vent', 'vitesse_de_vent', 'temperature',
       'humidité', 'visibilité', 'nebulosite', 'hauteur_neige', 'pays',
       'confirmed', 'part_of_day', 'delay', 'delayLabel'],
      dtype='object')

In [29]:
import featuretools as ft


es = ft.EntitySet(id = 'flight')



In [30]:
es = es.entity_from_dataframe(entity_id = 'flight', dataframe = pdf,index="id_Flight")



In [31]:
es

Entityset: flight
  Entities:
    flight [Rows: 17556, Columns: 23]
  Relationships:
    No relationships

In [32]:
primitives = ft.list_primitives()
pd.options.display.max_colwidth = 100
primitives[primitives['type'] == 'aggregation'].head(100)

Unnamed: 0,name,type,description
0,last,aggregation,Determines the last value in a list.
1,mode,aggregation,Determines the most commonly repeated value.
2,std,aggregation,"Computes the dispersion relative to the mean value, ignoring `NaN`."
3,entropy,aggregation,Calculates the entropy for a categorical variable
4,sum,aggregation,"Calculates the total addition, ignoring `NaN`."
5,median,aggregation,Determines the middlemost number in a list of values.
6,time_since_first,aggregation,Calculates the time elapsed since the first datetime (in seconds).
7,avg_time_between,aggregation,Computes the average number of seconds between consecutive events.
8,trend,aggregation,Calculates the trend of a variable over time.
9,percent_true,aggregation,Determines the percent of `True` values.


In [33]:
primitives[primitives['type'] == 'transform'].head(1000)

Unnamed: 0,name,type,description
22,and,transform,Element-wise logical AND of two lists.
23,greater_than_equal_to_scalar,transform,Determines if values are greater than or equal to a given scalar.
24,month,transform,Determines the month value of a datetime.
25,weekday,transform,Determines the day of the week from a datetime.
26,longitude,transform,Returns the second tuple value in a list of LatLong tuples.
27,modulo_numeric_scalar,transform,Return the modulo of each element in the list by a scalar.
28,greater_than_scalar,transform,Determines if values are greater than a given scalar.
29,is_null,transform,Determines if a value is null.
30,divide_numeric_scalar,transform,Divide each element in the list by a scalar.
31,divide_by_feature,transform,Divide a scalar by each value in the list.


In [34]:
agg_primitives=primitives[primitives['type'] == 'aggregation']['name']
trans_primitives=primitives[primitives['type'] == 'transform']['name']

In [35]:
features, feature_names = ft.dfs(entityset = es, target_entity = 'flight', 
                                 agg_primitives = primitives[primitives['type'] == 'aggregation']['name'],
                                 trans_primitives = ['year', 'day', 'month', 'week','weekday','hour','is_weekend','minute','second'])

In [36]:
features

Unnamed: 0_level_0,aircraft_aircrafttype_icaocode,flightnumbers_icaoflightnumber,airportresources_terminal,airports_destination_name,airlines_airline_name,airlines_operator_icaocode,airlines_airline_icaocode,countrytype_code,pression,direction_de_vent,vitesse_de_vent,temperature,humidité,visibilité,nebulosite,hauteur_neige,pays,confirmed,part_of_day,delay,delayLabel,YEAR(timestamps_sobt),DAY(timestamps_sobt),MONTH(timestamps_sobt),WEEK(timestamps_sobt),WEEKDAY(timestamps_sobt),HOUR(timestamps_sobt),IS_WEEKEND(timestamps_sobt),MINUTE(timestamps_sobt),SECOND(timestamps_sobt)
id_Flight,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1
0,A320,EJU4459,T1,Tenerife,Easy Jet Europe,EJU,EJU,S,101100.0,310.0,10.80,279.68,45.0,10000.0,75.0,0.0,France,18.0,afternoon,0,[0-15],2020,26,2,9,2,14,False,0,0
1,B737,KLM1416,T2,Amsterdam,Klm,KLM,KLM,S,101100.0,310.0,10.80,279.68,45.0,10000.0,75.0,0.0,France,18.0,afternoon,0,[0-15],2020,26,2,9,2,14,False,10,0
2,A320,EJU4353,T1,Nantes,Easy Jet Europe,EJU,EJU,N,101100.0,310.0,10.80,279.68,45.0,10000.0,75.0,0.0,France,18.0,afternoon,2,[0-15],2020,26,2,9,2,14,False,10,0
3,A319,EJU4371,T1,Berlin-Schonefeld,Easy Jet Europe,EJU,EJU,S,101100.0,310.0,10.80,279.68,45.0,10000.0,75.0,0.0,France,18.0,afternoon,0,[0-15],2020,26,2,9,2,14,False,15,0
4,A320,AFR7643,T2,Paris C. De Gaulle,Air France,AFR,AFR,N,101200.0,300.0,9.30,279.75,42.0,10000.0,75.0,0.0,France,18.0,afternoon,17,[15-30],2020,26,2,9,2,14,False,15,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17551,AT45,CLG076,T2,Limoges,Chalair Aviation,CLG,CLG,N,101900.0,170.0,4.60,286.20,38.0,10000.0,0.0,0.0,France,71412.0,evening,0,[0-15],2020,5,4,14,6,20,True,10,0
17552,B738,DAH1139,T1,Bejaia,Air Algerie,DAH,DAH,I,100800.0,209.0,2.25,284.77,95.0,0.0,100.0,0.0,France,167305.0,afternoon,0,[0-15],2020,1,5,18,4,12,False,50,0
17553,B748,DAH1037,T1,Alger,Air Algerie,DAH,DAH,I,100800.0,209.0,2.25,284.77,95.0,0.0,100.0,0.0,France,167305.0,afternoon,0,[0-15],2020,1,5,18,4,13,False,35,0
17554,B738,DAH1461,T1,Constantine,Air Algerie,DAH,DAH,I,100800.0,270.0,3.24,285.39,95.0,0.0,100.0,0.0,France,167305.0,evening,0,[0-15],2020,1,5,18,4,18,False,20,0


In [37]:
pdf.dtypes

id_Flight                                  int64
aircraft_aircrafttype_icaocode            object
flightnumbers_icaoflightnumber            object
airportresources_terminal                 object
airports_destination_name                 object
airlines_airline_name                     object
airlines_operator_icaocode                object
airlines_airline_icaocode                 object
countrytype_code                          object
timestamps_sobt                   datetime64[ns]
pression                                 float64
direction_de_vent                        float64
vitesse_de_vent                          float64
temperature                              float64
humidité                                 float64
visibilité                               float64
nebulosite                               float64
hauteur_neige                            float64
pays                                      object
confirmed                                float64
part_of_day         

In [38]:
def get_stats(group):
    return {'min': group.min(), 'max': group.max(),
            'count': group.count(), 'mean': group.mean()}

global_stats = pdf['delay'].groupby(pdf['airlines_airline_name']).apply(get_stats).unstack()
global_stats = global_stats.sort_values('count')
global_stats

Unnamed: 0_level_0,min,max,count,mean
airlines_airline_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
A supprimer,0.0,0.0,1.0,0.000000
Free Bird Airlines,0.0,0.0,1.0,0.000000
Onur Air Tasimacilik,0.0,0.0,1.0,0.000000
Transavia Holland,0.0,0.0,1.0,0.000000
RBG,0.0,0.0,1.0,0.000000
...,...,...,...,...
British Airways,0.0,124.0,581.0,11.820998
Air Algerie,0.0,475.0,889.0,30.133858
Lufthansa,0.0,132.0,1054.0,5.759013
Easy Jet Europe,0.0,260.0,2774.0,16.969358


In [39]:
dicte=global_stats.to_dict()['mean']

In [40]:
def addmean(row):
    
    row['meandelay']=dicte[row['airlines_airline_name']]
    return row

In [41]:
f=features.apply(lambda x:addmean(x),axis=1)

In [42]:
f.dtypes

aircraft_aircrafttype_icaocode     object
flightnumbers_icaoflightnumber     object
airportresources_terminal          object
airports_destination_name          object
airlines_airline_name              object
airlines_operator_icaocode         object
airlines_airline_icaocode          object
countrytype_code                   object
pression                          float64
direction_de_vent                 float64
vitesse_de_vent                   float64
temperature                       float64
humidité                          float64
visibilité                        float64
nebulosite                        float64
hauteur_neige                     float64
pays                               object
confirmed                         float64
part_of_day                        object
delay                               int64
delayLabel                         object
YEAR(timestamps_sobt)               int64
DAY(timestamps_sobt)                int64
MONTH(timestamps_sobt)            

In [43]:
traindf=f.copy()

In [44]:
traindf.columns

Index(['aircraft_aircrafttype_icaocode', 'flightnumbers_icaoflightnumber',
       'airportresources_terminal', 'airports_destination_name',
       'airlines_airline_name', 'airlines_operator_icaocode',
       'airlines_airline_icaocode', 'countrytype_code', 'pression',
       'direction_de_vent', 'vitesse_de_vent', 'temperature', 'humidité',
       'visibilité', 'nebulosite', 'hauteur_neige', 'pays', 'confirmed',
       'part_of_day', 'delay', 'delayLabel', 'YEAR(timestamps_sobt)',
       'DAY(timestamps_sobt)', 'MONTH(timestamps_sobt)',
       'WEEK(timestamps_sobt)', 'WEEKDAY(timestamps_sobt)',
       'HOUR(timestamps_sobt)', 'IS_WEEKEND(timestamps_sobt)',
       'MINUTE(timestamps_sobt)', 'SECOND(timestamps_sobt)', 'meandelay'],
      dtype='object')

In [45]:
for c in traindf.columns:
    col_type = traindf[c].dtype
    if col_type == 'object' or col_type.name == 'category':
        traindf[c] = traindf[c].astype('category')

In [46]:
from sklearn.preprocessing import LabelEncoder

lb_make = LabelEncoder()

traindf[['aircraft_aircrafttype_icaocode','airportresources_terminal','airlines_airline_icaocode','airlines_operator_icaocode','flightnumbers_icaoflightnumber','airports_destination_name','airlines_airline_name','countrytype_code','IS_WEEKEND(timestamps_sobt)','part_of_day']]=traindf[['aircraft_aircrafttype_icaocode','airportresources_terminal','airlines_airline_icaocode','airlines_operator_icaocode','flightnumbers_icaoflightnumber','airports_destination_name','airlines_airline_name','countrytype_code','IS_WEEKEND(timestamps_sobt)','part_of_day']].apply(lb_make.fit_transform)


In [47]:
from sklearn.pipeline import Pipeline, make_pipeline
from imblearn.pipeline import Pipeline, make_pipeline
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import RandomForestClassifier

In [48]:
import numpy as np
threshold = 100 # Anything that occurs less than this will be removed.
value_counts = traindf['delayLabel'].value_counts() # Specific column 
to_remove = value_counts[value_counts <= threshold].index
traindf['delayLabel'].replace(to_remove, np.nan, inplace=True)
traindf=traindf.dropna()

In [49]:

traindf=traindf.drop(['pays'], axis=1)

In [50]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer
mlb = MultiLabelBinarizer()
X_train, X_test, y_train, y_test = train_test_split(traindf.drop(['delayLabel','delay'], axis=1), traindf['delayLabel'], test_size=0.33, random_state=0)

In [51]:
from sklearn.model_selection import KFold
kf = KFold(n_splits=5, random_state=42, shuffle=False)

In [52]:
params = {
    'bootstrap': [True],
    'n_estimators': [400],
    'max_depth': [20],
    'random_state': [13],
    'max_features': [18]
}


In [53]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [54]:
from imblearn.under_sampling import NearMiss
imba_pipeline = make_pipeline(SMOTE(sampling_strategy='not majority'), 
                              RandomForestClassifier(n_estimators=100, random_state=13))
cross_val_score(imba_pipeline, X_train, y_train, scoring='f1_micro', cv=kf)

array([0.736045  , 0.75670996, 0.74675325, 0.75108225, 0.74632035])

In [56]:
from sklearn.model_selection import GridSearchCV
new_params = {'randomforestclassifier__' + key: params[key] for key in params}
grid_imba = GridSearchCV(imba_pipeline, param_grid=new_params, cv=kf, scoring='f1_micro',
                        return_train_score=True)
grid_imba.fit(X_train, y_train);

In [57]:
grid_imba.best_params_

{'randomforestclassifier__bootstrap': True,
 'randomforestclassifier__max_depth': 20,
 'randomforestclassifier__max_features': 18,
 'randomforestclassifier__n_estimators': 400,
 'randomforestclassifier__random_state': 13}

In [58]:
grid_imba.best_score_

0.7388112565351854

In [59]:
import numpy as np
from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import SVC
import xgboost as xgb
clf = OneVsRestClassifier(xgb.XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1.0,
         gamma=0.0, max_delta_step=0.0, min_child_weight=1.0,
         missing=None, n_jobs=-1, objective='binary:logistic', random_state=42, reg_alpha=0.0,
         reg_lambda=1.0, scale_pos_weight=1.0, tree_method='auto')).fit(X_train,y_train)

In [60]:
ypred=clf.predict(X_test)

In [61]:
from sklearn.metrics import classification_report, confusion_matrix
print(classification_report(y_test, ypred))

              precision    recall  f1-score   support

      [0-15]       0.86      0.97      0.91      4523
   [105-120]       0.40      0.22      0.28        37
     [15-30]       0.25      0.09      0.13       479
     [30-45]       0.25      0.16      0.19       231
     [45-60]       0.29      0.17      0.22       173
     [60-75]       0.30      0.27      0.29       113
     [75-90]       0.22      0.16      0.19        73
    [90-105]       0.42      0.13      0.20        61

    accuracy                           0.80      5690
   macro avg       0.37      0.27      0.30      5690
weighted avg       0.74      0.80      0.76      5690



In [62]:
import xgboost as xgb
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import GridSearchCV
xgbcl = xgb.XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1.0,
         gamma=0.0, max_delta_step=0.0, min_child_weight=1.0,
         missing=None, n_jobs=-1, objective='binary:logistic', random_state=42, reg_alpha=0.0,
         reg_lambda=1.0, scale_pos_weight=1.0, tree_method='auto')

kfold = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)

param_grid = { 
    'colsample_bytree':[.75,1],
    'learning_rate':[0.01,0.05,0.1,0.3,0.5],
    'max_depth':[1,2,3,5],
    'subsample':[.75,1],
    'n_estimators': list(range(50, 400, 50))
}

grid_search = GridSearchCV(estimator=xgbcl, scoring='f1_micro', param_grid=param_grid, n_jobs=-1, cv=kfold)
grid_result = grid_search.fit(X_train, y_train)

print(f'Best: {grid_result.best_score_} using {grid_result.best_params_}','\n')

Best: 0.7938710246734136 using {'colsample_bytree': 0.75, 'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 350, 'subsample': 1} 



In [63]:
from imblearn.over_sampling import SMOTE

# Resample the minority class. You can change the strategy to 'auto' if you are not sure.
sm = SMOTE(sampling_strategy='not majority')

# Fit the y_train to generate the data.
X_train, y_train = sm.fit_sample(X_train, y_train)

In [64]:
import xgboost as xgb
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import GridSearchCV
xgbcl = xgb.XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1.0,
         gamma=0.0, max_delta_step=0.0, min_child_weight=1.0,
         missing=None, n_jobs=-1, objective='binary:logistic', random_state=42, reg_alpha=0.0,
         reg_lambda=1.0, scale_pos_weight=1.0, tree_method='auto')

kfold = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)

param_grid = { 
    'colsample_bytree':[.75,1],
    'learning_rate':[0.01,0.05,0.1,0.3,0.5],
    'max_depth':[1,2,3,5],
    'subsample':[.75,1],
    'n_estimators': list(range(50, 400, 50))
}

grid_search = GridSearchCV(estimator=xgbcl, scoring='f1_micro', param_grid=param_grid, n_jobs=-1, cv=kfold)
grid_result = grid_search.fit(X_train, y_train)

print(f'Best: {grid_result.best_score_} using {grid_result.best_params_}','\n')

KeyboardInterrupt: 

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier
clr = OneVsRestClassifier(XGBClassifier())
from imblearn.over_sampling import SMOTE

# Resample the minority class. You can change the strategy to 'auto' if you are not sure.
sm = SMOTE(sampling_strategy='not majority')

# Fit the y_train to generate the data.
X, Y = sm.fit_sample(X_train, y_train)
clr.fit(X, Y)

In [None]:
import numpy
numpy.mean(clr.predict(X_test).ravel() == y_test.ravel()) * 100

In [None]:
from sklearn.metrics import confusion_matrix
import pandas
df = pandas.DataFrame(confusion_matrix(y_test, clr.predict(X_test)))
try:
    df.columns = [str(_) for _ in clr.classes_][:df.shape[1]]
    df.index = [str(_) for _ in clr.classes_][:df.shape[0]]
except ValueError:
    # Il peut arriver qu'une classe ne soit pas représenter
    # lors de l'apprentissage
    print("erreur", df.shape, clr.classes_)
df

In [None]:
from sklearn.ensemble import RandomForestClassifier
rfc = XGBClassifier()
rfc.fit(X_train, y_train)
numpy.mean(rfc.predict(X_test).ravel() == y_test.ravel()) * 100

In [None]:
rf_train = clr.decision_function(X_train)

rfc_y = XGBClassifier()
rfc_y.fit(rf_train, y_train)

In [None]:
rf_test = rfc_y.predict(X_test)
numpy.mean(rf_test == y_test) * 100

In [None]:
from sklearn.metrics import classification_report, confusion_matrix
print(classification_report(y_test, rf_test))

In [None]:
X_train

In [None]:
param = {
    'max_depth': 3,  # the maximum depth of each tree
    'eta': 0.3,  # the training step for each iteration
    'silent': 1,  # logging mode - quiet
    'objective': 'multi:softprob',  # error evaluation for multiclass training
    'num_class': 3}  # the number of classes that exist in this datset
num_round = 20  

In [None]:
import xgboost
import shap# load JS visualization code to notebook
shap.initjs()# train XGBoost model
y=LabelEncoder().fit_transform(y_train)
model = xgboost.train({"learning_rate": 0.02}, xgboost.DMatrix(X_train, label=y), 100)# explain the model's predictions using SHAP values
# (same syntax works for LightGBM, CatBoost, and scikit-learn models)
explainer = shap.TreeExplainer(model)
shap_values = explainer.shap_values(X_train)
shap.summary_plot(shap_values, X_train, plot_type="bar")

In [None]:
from sklearn.feature_selection import SelectKBest,chi2
test = SelectKBest(score_func=chi2, k=)
fit = test.fit(X, y)
features = fit.transform(X)

In [None]:
print(features)

In [None]:
def get_models():
    models, names = list(), list()
    # RF
    models.append(RandomForestClassifier(n_estimators=1000))
    names.append('RF')
    return models, names

In [None]:
def evaluate_model(X, y, model):
    # define evaluation procedure
    cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=3, random_state=1)
    # evaluate model
    scores = cross_val_score(model, X, y, scoring='f1_micro', cv=cv, n_jobs=-1)
    return scores

In [None]:
pyplot.boxplot(results, labels=names, showmeans=True)
pyplot.show()

In [None]:
from sklearn.model_selection import RepeatedStratifiedKFold
models=get_models()
evaluate_model(X_train, y_train, models[0])

In [None]:
clf = RandomForestClassifier(max_depth=20, random_state=0,criterion = 'entropy',bootstrap= True,
    n_estimators=100,max_features=5)
clf.fit(X_train, y_train)

In [None]:
print(clf.feature_importances_)

In [None]:
ypred=clf.predict(X_test)

In [None]:
print(classification_report(y_test, ypred))


In [None]:
from sklearn.metrics import confusion_matrix
import pandas
df = pandas.DataFrame(confusion_matrix(y_test, clf.predict(X_test)))
try:
    df.columns = [str(_) for _ in clf.classes_][:df.shape[1]]
    df.index = [str(_) for _ in clf.classes_][:df.shape[0]]
except ValueError:
    # Il peut arriver qu'une classe ne soit pas représenter
    # lors de l'apprentissage
    print("erreur", df.shape, clr.classes_)
df

In [None]:
model.score(X_test,y_test)

In [None]:
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [None]:
model = XGBClassifier()
model.fit(X_train, y_train)

In [None]:
model = XGBClassifier(colsample_bytree=1,learning_rate=0.5,max_depth=2,subsample=1,n_estimators=400)
model.fit(X_train, y_train)

In [None]:

# make predictions for test data
y_pred2 = model.predict(X_test)

In [1]:
accuracy = accuracy_score(y_test, y_pred2)
print("Accuracy: %.2f%%" % (accuracy * 100.0))

NameError: name 'accuracy_score' is not defined

In [None]:
print(classification_report(y_test, y_pred2))

In [None]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
def select_features(X_train, y_train, X_test):
    fs = SelectKBest(score_func=chi2, k=15)
    fs.fit(X_train, y_train)
    X_train_fs = fs.transform(X_train)
    X_test_fs = fs.transform(X_test)
    return X_train_fs, X_test_fs

In [None]:
X_train_fs, X_test_fs = select_features(X_train, y_train, X_test)

In [None]:
clf = RandomForestClassifier(max_depth=10, random_state=42,criterion = 'entropy',bootstrap= True,
    n_estimators=400,min_samples_split=50,n_jobs=1)
clf.fit(X_train, y_train)

In [None]:
clf.score(X_test,y_test)

In [None]:
from sklearn.preprocessing import OrdinalEncoder
def prepare_inputs(X_train, X_test):
    oe = OrdinalEncoder()
    oe.fit(X_train)
    X_train_enc = oe.transform(X_train)
    oe.fit(X_test)
    X_test_enc = oe.transform(X_test)
    return X_train_enc, X_test_enc
 
# prepare target
def prepare_targets(y_train, y_test):
    le = LabelEncoder()
    le.fit(y_train)
    y_train_enc = le.transform(y_train)
    y_test_enc = le.transform(y_test)
    return y_train_enc, y_test_enc

In [None]:
X_train, X_test = prepare_inputs(X_train, X_test)
# prepare output data
y_train, y_test = prepare_targets(y_train, y_test)

In [None]:
X_train

In [None]:
from sklearn.model_selection import RepeatedStratifiedKFold
cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=3, random_state=1)
rfc_cv_score = cross_val_score(clf, X_train, y_train, cv=cv, scoring='recall_micro')

In [None]:
rfc_cv_score

In [None]:
import sklearn
sklearn.metrics.SCORERS.keys()

In [None]:
param = {}
param['booster'] = 'gbtree'
param['objective'] = "multi:softmax"
param["eval_metric"] = "mlogloss"
param['eta'] = 0.3
param['gamma'] = 0
param['scale_pos_weight']=100
param['max_depth'] = 6
param['min_child_weight']=1
param['max_delta_step'] = 5
param['subsample']= 1
param['colsample_bytree']=1
param['silent'] = 1
param['seed'] = 0
param['base_score'] = 0.5
param['early_stopping_rounds'] = 10
param['learning_rate'] = 0.03
param['n_estimators'] = 10
import xgboost as xgb
clf = xgb.XGBClassifier(param)

In [None]:
from imblearn.over_sampling import ADASYN

smt = ADASYN(random_state=42)
X_smt, y_smt = smt.fit_sample(X_train, y_train)

In [None]:
boost = xgb.XGBClassifier(param)
boost.fit(X_train, y_train)
p_boost = boost.predict(X_train)
print ("Score Train -->", round(boost.score(X_test, y_test) *100,2), " %")


In [None]:
xgb.plot_importance(boost)

In [None]:
from sklearn.metrics import confusion_matrix
import pandas
df = pandas.DataFrame(confusion_matrix(y_test, boost.predict(X_test)))
try:
    df.columns = [str(_) for _ in boost.classes_][:df.shape[1]]
    df.index = [str(_) for _ in boost.classes_][:df.shape[0]]
except ValueError:
    # Il peut arriver qu'une classe ne soit pas représenter
    # lors de l'apprentissage
    print("erreur", df.shape, boost.classes_)
df

In [None]:
import matplotlib as mpl
import matplotlib.pyplot as plt
import matplotlib.patches as patches
from matplotlib.patches import ConnectionPatch


fig_dim=(14,18)
f,ax=plt.subplots(figsize=fig_dim)
quality=y_smt.unique()
size=y_smt.value_counts()
plt.pie(size,labels=quality,autopct="%1.0f%%")
plt.show() 

In [None]:
import lightgbm as lgb

In [None]:
from lightgbm import LGBMClassifier


lgbm = LGBMClassifier(objective='multiclass', random_state=5,categorical_feature= 'auto',feature_name='auto')

lgbm.fit(X_train, y_train)

y_pred = lgbm.predict(X_test)

In [None]:
lgbm.score(X_test,y_test)

In [None]:
from sklearn.metrics import confusion_matrix
import pandas
df = pandas.DataFrame(confusion_matrix(y_test, lgbm.predict(X_test)))
try:
    df.columns = [str(_) for _ in lgbm.classes_][:df.shape[1]]
    df.index = [str(_) for _ in lgbm.classes_][:df.shape[0]]
except ValueError:
    # Il peut arriver qu'une classe ne soit pas représenter
    # lors de l'apprentissage
    print("erreur", df.shape, lgbm.classes_)
df

In [None]:
from xgboost import XGBClassifier

# manually handling imbalance. Below is same as computing float(18501)/392318 on the trainig dataset.
# We are going to inversely assign the weights
newlist=set(y_train)
newlist.remove('[0-15]')
total=0
w_array = np.array([1]*y_train.shape[0])
for classes in newlist:
    
    weight_ratio = float(len(y_train[y_train == '[0-15]']))/float(len(y_train[y_train == 
    classes]))
    w_array[y_train==classes] = weight_ratio
w_array[y_train=='[0-15]'] = 1
xgc = XGBClassifier()
xgc.fit(X_train, y_train, sample_weight=w_array) 


In [None]:
w_array

In [None]:
set(y_train)

In [None]:
print ("Score Train -->", round(xgc.score(X_test, y_test) *100,2), " %")

In [None]:
from sklearn.metrics import confusion_matrix
import pandas
df = pandas.DataFrame(confusion_matrix(y_test, xgc.predict(X_test)))
try:
    df.columns = [str(_) for _ in xgc.classes_][:df.shape[1]]
    df.index = [str(_) for _ in xgc.classes_][:df.shape[0]]
except ValueError:
    # Il peut arriver qu'une classe ne soit pas représenter
    # lors de l'apprentissage
    print("erreur", df.shape, xgc.classes_)
df

In [None]:
from sklearn.metrics import classification_report
print(classification_report(y_test, xgc.predict(X_test)))

In [None]:
from sklearn.metrics import classification_report
print(classification_report(y_train, xgc.predict(X_train)))

In [None]:
w_array = np.ones(y_train.shape[0], dtype = 'float')
for i, val in enumerate(y_train):
    w_array[i] = class_weights[val]

In [None]:
weight = CreateBalancedSampleWeights(y_train, largest_class_weight_coef)

In [None]:
from imblearn.under_sampling import RandomUnderSampler
rus = RandomUnderSampler(random_state=42)
X_train, y_train = rus.fit_resample(X_train, y_train)