In [2]:
import numpy as np
import pandas as pd

In [3]:
class DataHandler:
    def __init__(self):
        print("intialisation")
        self.df_lf = None
        self.df_pa = None
        self.df_res = None
        print("intialisation done")
    def get_data(self):
        print("loading data from bucket")
        self.df_lf = pd.read_csv("https://storage.googleapis.com/h3-data/listings_final.csv",sep=';')
        self.df_pa = pd.read_csv("https://storage.googleapis.com/h3-data/price_availability.csv",sep=';')
        print("data loaded from bucket")

    def group_data(self):
        print("merging data")
        self.df_res = pd.merge(self.df_lf,self.df_pa.groupby('listing_id')['local_price'].mean('local_price'),how='inner', on='listing_id')
        print("size of the merged data : {} lines, {} columns".format(self.df_res.shape[0],self.df_res.shape[1]))

    def get_process_data(self):
        self.get_data()
        self.group_data()

In [4]:
dt = DataHandler()

intialisation
intialisation done


In [5]:
dt.get_process_data()

loading data from bucket
data loaded from bucket
merging data
size of the merged data : 999 lines, 20 columns


In [6]:
class FeatureRecipe:

    def __init__(self,df:pd.DataFrame):
        print("FeatureRecipe intialisation")
        self.df = df
        self.cate = []
        self.floa = []
        self.intt = []
        self.drop = []
        print("end of intialisation\n")

    def separate_variable_types(self) -> None:
        print("separating columns")
        for col in self.df.columns:
            if self.df[col].dtypes == int:
                self.intt.append(self.df[col])
            elif self.df[col].dtypes == float:
                self.floa.append(self.df[col])
            else:
                self.cate.append(self.df[col])
        print ("dataset column size : {} \nnumber of discreet values : {} \nnumber of continuous values : {} \nnumber of others : {} \ntaille total : {}".format(len(self.df.columns),len(self.intt),len(self.floa),len(self.cate),len(self.intt)+len(self.floa)+len(self.cate) ))

    def drop_na_prct(self,threshold : float):
        """
            on appelle la commande et on met un threshold entre 1 et 0 en flottant
            params: threshold : float
        """
        # par rapport a la colonne
        dropped = 0
        print("dropping columns with {} percentage ".format(threshold))
        for col in self.df.columns:
            if self.df[col].isna().sum()/self.df.shape[0] >= threshold:
                self.drop.append( self.df.drop([col], axis='columns', inplace=True) )
                dropped+=1
        print("dropped {} columns".format(dropped))

    def drop_useless_features(self):
        # droper les col vides et doublons de l'index et les colonnes qu'on va considerer inutile
        print("dropping useless columns")
        dropped = 0
        if 'Unnamed: 0' in self.df.columns:
            self.df.drop(['Unnamed: 0'], axis='columns', inplace=True)
            dropped+=1
        for col in self.df.columns:
            if self.df[col].isna().sum() == len(self.df):
                self.df.drop([col], axis='columns', inplace=True)
                dropped+=1
        print("done dropping {} column(s)".format(dropped))


    def drop_duplicate(self):
        # comparer les colonnes et voir si elles sont dupliquées
        print("dropping duplicated rows")
        self.df.drop_duplicates(inplace=True)
        duplicates = self.get_duplicates()
        for col in duplicates:
            print("dropping column :{}".format(col))
            self.df.drop([col], axis='columns', inplace=True)
        print("duplicated rows dropped")

    def get_duplicates(self):
        duplicates = []
        #for col in self.df.columns:
            #for scol in self.df.columns:
        for col in range(self.df.shape[1]):
            for scol in range(col+1,self.df.shape[1]):
                if self.df.iloc[:,col].equals(self.df.iloc[:,scol]):
                    duplicates.append(self.df.iloc[:,scol].name)
        return duplicates

#   def deal_date_time(self):
#        pass
    def get_process_data(self,threshold : float):
        self.separate_variable_types()
        self.drop_useless_features()
        self.drop_na_prct(threshold)
        self.drop_duplicate()
        print("end of FeatureRecipe processing")

In [7]:
fr = FeatureRecipe(dt.df_res)
fr.get_process_data(0.3)

FeatureRecipe intialisation
end of intialisation

separating columns
dataset column size : 20 
number of discreet values : 5 
number of continuous values : 6 
number of others : 9 
taille total : 20
dropping useless columns
done dropping 1 column(s)
dropping columns with 0.3 percentage 
dropped 0 columns
dropping duplicated rows
dropping column :is_business_travel_ready
duplicated rows dropped
end of FeatureRecipe processing


In [8]:
import sklearn as skn
import matplotlib as plt
from sklearn.model_selection import train_test_split

In [9]:

class FeatureExtractor:
    """
    Feature Extractor class
    """
    def __init__(self, data: pd.DataFrame, flist: list):
        """
            Input : pandas.DataFrame, feature list to drop
            Output : X_train, X_test, y_train, y_test according to sklearn.model_selection.train_test_split
        """
        print("FeatureExtractor intialisation")
        self.X_train, self.X_test, self.y_train, self.y_test = None,None,None,None
        self.df = data
        self.flist = flist
        print("intialisation done")

    def extractor(self):
        print("extracting unwanted columns")
        for col in self.flist:
            if col in self.df:
                self.df.drop(col, axis=1, inplace=True)
        print("done extracting unwanted columns")

    def splitting(self, size:float,rng:int, y:str):
        print("splitting dataset for train and test")
        x = self.df.loc[:,self.df.columns != y]
        self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(x, self.df[y], test_size=size, random_state=rng)
        print("splitting done")

    def get_process_data(self):
        self.extractor()
        self.splitting(0.3,42,'local_price')
        print("done processing Feature Extractor")
        return self.X_train, self.X_test, self.y_train, self.y_test


In [10]:
flist = ['listing_id','name','type','city','neighborhood','latitude','longitude','is_rebookable','is_new_listing','is_fully_refundable','is_host_highly_rated']

In [11]:
fe = FeatureExtractor(fr.df,flist)
X_trn, X_tst, y_trn, y_tst = fe.get_process_data()

FeatureExtractor intialisation
intialisation done
extracting unwanted columns
done extracting unwanted columns
splitting dataset for train and test
splitting done
done processing Feature Extractor


In [12]:
fe.y_test

453      79.810390
793     125.000000
209      95.453333
309      29.000000
740      82.588391
          ...     
314     718.092784
404     517.996183
7       213.324607
155      79.000000
809    1300.000000
Name: local_price, Length: 300, dtype: float64

In [19]:
plt.pyplot.plot(fe.df["listing_id"],fe.df["local_price"])

AttributeError: module 'matplotlib' has no attribute 'pyplot'