# **Imports and defined functions**

## **Imports**

In [1]:
# Libraries for data wrangling
import numpy as np
import pandas as pd

# Libraries for data visualisation
import matplotlib.pyplot as plt
import seaborn as sns

# Libraries for data preparation
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import MinMaxScaler
from sklearn.decomposition import PCA 

## **Defined functions**

In [2]:
def haversine_manhattan(lat1, lon1, lat2, lon2):
    """
        Manhattan-style (L1) distance between two geographic points, in km.

        The latitude difference and the longitude difference are each converted
        to an arc length with the haversine formula, and the two legs are summed.

        Parameters:
                lat1, lon1: latitude / longitude of the first point(s), in degrees
                lat2, lon2: latitude / longitude of the second point(s), in degrees
                All four arguments may be scalars or array-likes; they are combined
                with NumPy element-wise operations (broadcasting applies).

            Returns:
                Sum of the latitude leg and the longitude leg, in km

        NOTE(review): the longitude leg is not scaled by cos(latitude), i.e. it is
        measured as if along the equator - confirm that this is intended.
    """
    R = 6371 # mean Earth radius, in km (was mistyped as 6731)

    delta_lat = np.abs(np.radians(lat1) - np.radians(lat2))
    delta_lon = np.abs(np.radians(lon1) - np.radians(lon2))

    # Haversine central angle for the north-south leg
    lat_term = np.sin(delta_lat / 2) ** 2
    lat_angle = 2 * np.arctan2(np.sqrt(lat_term), np.sqrt(1 - lat_term))
    latitude_distance = R * lat_angle

    # Haversine central angle for the east-west leg
    lon_term = np.sin(delta_lon / 2) ** 2
    lon_angle = 2 * np.arctan2(np.sqrt(lon_term), np.sqrt(1 - lon_term))
    longitude_distance = R * lon_angle

    return np.abs(latitude_distance) + np.abs(longitude_distance) # in km

In [3]:
def vectorize_and_pca(data, variable, ngram_range, token_pattern, n_components, random_state = None):
    """
        Vectorizes a categorical/text column with a bag-of-words model, reduces it
        with PCA and min-max scales the resulting components.

        Parameters:
                data(pd.DataFrame): dataframe with data
                variable(str): name of the column to vectorize
                ngram_range(tuple): n-gram range for CountVectorizer. Examples: (1, 1), (2, 2), (3, 3)
                token_pattern(str): token pattern for CountVectorizer, used where a specific splitter is needed
                n_components(int): number of PCA components
                random_state(int or None): forwarded to PCA; None keeps the PCA default

            Returns:
                Tuple (vec, pca, scaler, data_transformed_normed): the fitted CountVectorizer,
                the fitted PCA, the fitted MinMaxScaler and a dataframe of scaled components
                named '<variable>_pca_component_<k>', indexed like `data`

    """

    vec = CountVectorizer(ngram_range = ngram_range, token_pattern = token_pattern) # build the bag-of-words
    bow = vec.fit_transform(data[variable])

    # A single fit_transform is enough; fitting first and then calling fit_transform refits the model.
    pca = PCA(n_components = n_components, random_state = random_state)
    components = pca.fit_transform(bow)

    col_names = [variable + '_pca_component_' + str(i) for i in range(1, n_components + 1)]

    scaler = MinMaxScaler()
    components_normed = scaler.fit_transform(components)

    # Reuse the index of `data`: a fresh RangeIndex would not line up with `data`
    # in an index-based merge once rows have been dropped from it.
    data_transformed_normed = pd.DataFrame(components_normed, columns = col_names, index = data.index)

    return vec, pca, scaler, data_transformed_normed

In [4]:
def _min_distance_to_poor_trees(df, lat_lon_poor_health, chunk_size = 256):
    """
        For every row of df, smallest non-zero haversine_manhattan distance (km) to any
        point of lat_lon_poor_health; NaN when there is no such point.
    """
    tree_lat = df['latitude'].to_numpy(dtype = float)
    tree_lon = df['longitude'].to_numpy(dtype = float)
    poor_lat = lat_lon_poor_health['latitude_x'].to_numpy(dtype = float)
    poor_lon = lat_lon_poor_health['longitude_x'].to_numpy(dtype = float)

    nearest = np.full(len(tree_lat), np.nan)
    if len(poor_lat) == 0:
        return nearest

    # Work in row chunks so the (chunk x poor trees) distance matrix stays small in memory.
    for start in range(0, len(tree_lat), chunk_size):
        stop = start + chunk_size
        distances = haversine_manhattan(tree_lat[start:stop, None], tree_lon[start:stop, None],
                                        poor_lat[None, :], poor_lon[None, :])
        # A zero distance is discarded (e.g. a tree compared with itself).
        distances = np.where(distances == 0, np.nan, distances)
        # fmin ignores NaN unless every candidate is NaN, matching Series.min().
        nearest[start:stop] = np.fmin.reduce(distances, axis = 1)

    return nearest


def data_wrangling(df, lat_lon_poor_health, reference = None):
    """
        Adds engineered feature columns to df IN PLACE; returns None.

        Parameters:
                df(pd.DataFrame): frame to extend; reads 'latitude', 'longitude', 'tree_dbh',
                    'problems', 'steward', 'guards', 'user_type', 'borough', 'nta_name'
                lat_lon_poor_health(pd.DataFrame): points to measure the distance to,
                    with columns 'latitude_x' and 'longitude_x'
                reference(pd.DataFrame or None): frame whose most frequent category values
                    define the 'has_<value>_<column>' indicators. When None, the
                    notebook-level variable `data` is used (the previous implicit behaviour),
                    so it must be defined before the call.

        Columns added:
                haversine_manhattan_distance_to_poor_tree, tree_dbh_wrangled (capped at 35),
                problems_count (capped at 4), has_problem, has_<problem>_problem,
                has_<value>_<column> for the most frequent values of the categorical columns
    """

    if reference is None:
        reference = data # backward-compatible default: notebook-level frame

    df['haversine_manhattan_distance_to_poor_tree'] = _min_distance_to_poor_trees(df, lat_lon_poor_health)

    df['tree_dbh_wrangled'] = np.where(df['tree_dbh'] > 35, 35, df['tree_dbh'])

    # Number of comma-separated problems: 0 for the literal 'None', capped at 4
    df['problems_count'] = df['problems'].str.split(',').str.len()
    df['problems_count'] = np.where(df['problems'] == 'None', 0, df['problems_count'])
    df['problems_count'] = np.where(df['problems_count'] > 4, 4, df['problems_count'])
    df['has_problem'] = np.where(df['problems_count'] > 0, 1, 0)

    list_of_problems = ['Stones', 'BranchLights', 'TrunkOther', 'BranchOther',
                        'RootOther', 'WiresRope', 'MetalGrates']
    for value in list_of_problems:
        column_name = 'has_' + str(value) + '_problem'
        # Plain substring match; the problem names contain no regex metacharacters
        df[column_name] = np.where(df['problems'].str.contains(value, regex = False), 1, 0)

    # Indicator columns for the N most frequent values (taken from `reference`) of each column
    top_values_per_column = {'steward': 3, 'guards': 3, 'user_type': 2, 'borough': 4, 'nta_name': 5}
    for column, top_n in top_values_per_column.items():
        most_frequent = reference[column].value_counts().sort_values(ascending = False).head(top_n).index
        for value in most_frequent:
            column_name = 'has_' + str(value) + '_' + column
            df[column_name] = np.where(df[column] == value, 1, 0)

# **Data wrangling**

Импорт данных

In [5]:
# Read the raw street tree census export
raw_census_path = "C:\\Users\\skako\\Тестовое задание Максибитсолюшен\\data\\2015-street-tree-census-tree-data.csv"
data = pd.read_csv(raw_census_path)

# Hold out 10% of the rows as a validation sample (fixed seed)
data, data_validation = train_test_split(
    data,
    test_size = 0.1,
    random_state = 42,
)

Балансировка классов

In [6]:
# Balance the classes by down-sampling 'Good' and 'Fair' to the size of the 'Poor' class.
# The class size is read from the 'Poor' subset itself instead of relying on the order of
# value_counts(); if 'Poor' were not the smallest class, .sample() would raise.
c3 = data[data['health'] == 'Good']
c2 = data[data['health'] == 'Fair']
c1 = data[data['health'] == 'Poor']
class_1 = len(c1)

# Fixed seed so that the balanced sample is reproducible on Restart & Run All
df_3 = c3.sample(class_1, random_state = 42)
df_2 = c2.sample(class_1, random_state = 42)

# Rows with a missing 'health' match none of the three filters above,
# so no extra dropna on 'health' is needed after the concat.
data = pd.concat([df_3, df_2, c1], axis = 0)
data['health'].value_counts()

health
Good    24137
Fair    24137
Poor    24137
Name: count, dtype: int64

In [7]:
# Renumber both samples from 0, discarding the old index
data = data.reset_index(drop = True)
data_validation = data_validation.reset_index(drop = True)

Сохранение необработанных выборок в репозиторий. 
data - сбалансированная выборка для обучения и тестирования моделей.
data_validation - часть несбалансированной выборки, которая будет использована для итогового тестирования отобранной модели. Это нужно для независимого тестирования данных без влияния обучающих образцов, а также тестирования модели "как на проде".

In [8]:
# Persist the unprocessed balanced and validation samples
balanced_raw_path = "C:\\Users\\skako\\Тестовое задание Максибитсолюшен\\data\\data_balanced_raw.csv"
validation_raw_path = "C:\\Users\\skako\\Тестовое задание Максибитсолюшен\\data\\data_validation.csv"

data.to_csv(balanced_raw_path, index = False)
data_validation.to_csv(validation_raw_path, index = False)

Отбираем необходимые поля:

In [9]:
# Columns carried forward into feature engineering (order preserved)
columns_to_keep = [
    'tree_dbh', 'curb_loc', 'health', 'spc_common',
    'steward', 'guards', 'sidewalk', 'user_type', 'problems',
    'root_stone', 'root_grate', 'root_other',
    'trunk_wire', 'trnk_light', 'trnk_other',
    'brch_light', 'brch_shoe', 'brch_other',
    'address', 'zip_city', 'borough', 'nta_name',
    'latitude', 'longitude',
]

data = data.loc[:, columns_to_keep].copy()
data_validation = data_validation.loc[:, columns_to_keep].copy()

Заполняем пропуски для данных полей значением "None", поскольку библиотекой Pandas они ошибочно распознаются как пропуски:

In [10]:
# Missing values in these columns are replaced with the literal string 'None'
none_fill_values = {'steward':'None', 'guards':'None', 'problems':'None'}

data = data.fillna(none_fill_values)
data_validation = data_validation.fillna(none_fill_values)

Очистка малозначимых пропусков:

In [11]:
# Drop rows without a species name and renumber the index afterwards.
# The later steps merge positionally built frames (RangeIndex) on the index,
# so index gaps left by dropna would misalign or silently drop rows there.
data = data.dropna(subset = 'spc_common').reset_index(drop = True)
data_validation = data_validation.dropna(subset = 'spc_common').reset_index(drop = True)

Подготовка данных для формирования основных полей:

In [12]:
# Coordinates of the 'Poor' trees, with the coordinate columns suffixed '_x'
is_poor = data['health'] == 'Poor'
lat_lon_poor_health = (
    data.loc[is_poor, ['latitude', 'longitude', 'health']]
        .reset_index(drop = True)
        .rename(columns={'latitude': 'latitude_x', 'longitude': 'longitude_x'})
)

In [25]:
# Add the engineered columns to both samples (data_wrangling mutates its argument)
for frame in (data, data_validation):
    data_wrangling(frame, lat_lon_poor_health)

Проведение связки "Bag of Words + PCA"

In [14]:
token_pattern = '(?u)[a-zA-Z][a-z ]+' # keeps spaces inside a token, so the vectorizer does not split on them

# Fit "Bag of Words + PCA" separately for each text column
(vec_spc_common, pca_spc_common, scaler_spc_common,
 data_transformed_normed_spc_common) = vectorize_and_pca(data, 'spc_common', (1, 1), token_pattern, 10)

(vec_problems, pca_problems, scaler_problems,
 data_transformed_normed_problems) = vectorize_and_pca(data, 'problems', (1, 1), token_pattern, 5)

(vec_address, pca_address, scaler_address,
 data_transformed_normed_address) = vectorize_and_pca(data, 'address', (2, 2), token_pattern, 5)

(vec_zip_city, pca_zip_city, scaler_zip_city,
 data_transformed_normed_zip_city) = vectorize_and_pca(data, 'zip_city', (1, 1), token_pattern, 5)

(vec_nta_name, pca_nta_name, scaler_nta_name,
 data_transformed_normed_nta_name) = vectorize_and_pca(data, 'nta_name', (2, 2), token_pattern, 5)

# Attach the component columns to the training frame, matching rows on the index
for pca_features in (data_transformed_normed_spc_common,
                     data_transformed_normed_problems,
                     data_transformed_normed_address,
                     data_transformed_normed_zip_city,
                     data_transformed_normed_nta_name):
    data = pd.merge(data, pca_features, left_index=True, right_index=True)

Скоринг валидационной выборки:

In [15]:
# Parallel lists: position k of every list belongs to column cols[k]
cols = ['spc_common', 'problems', 'address', 'zip_city', 'nta_name']
vecs = [
    vec_spc_common, vec_problems, vec_address, vec_zip_city, vec_nta_name,
]
pcas = [
    pca_spc_common, pca_problems, pca_address, pca_zip_city, pca_nta_name,
]
scalers = [
    scaler_spc_common, scaler_problems, scaler_address, scaler_zip_city, scaler_nta_name,
]

In [17]:
# Apply the vectorizer, PCA and scaler fitted on the training sample to the validation sample
for col, fitted_vec, fitted_pca, scaler in zip(cols, vecs, pcas, scalers):
    col_names = [col + '_pca_component_' + str(j) for j in range(1, fitted_pca.n_components_ + 1)]
    data_transformed = fitted_pca.transform(fitted_vec.transform(data_validation[col]))

    # Index the new columns like data_validation: a fresh RangeIndex would misalign
    # (or drop) rows in the index merge below whenever data_validation's index has gaps.
    data_transformed_normed = pd.DataFrame(scaler.transform(data_transformed), columns = col_names, index = data_validation.index)
    data_validation = pd.merge(data_validation, data_transformed_normed, left_index=True, right_index=True)

"one-hot encoding" бинарных переменных:

In [18]:
list_of_binary = ['curb_loc', 'sidewalk',
                  'root_stone', 'root_grate', 'root_other',
                  'trunk_wire', 'trnk_light', 'trnk_other',
                  'brch_light', 'brch_shoe', 'brch_other',
                  ]

# Training sample: one indicator per category, first category of each column dropped
data_dummies = pd.get_dummies(data[list_of_binary], dtype = int, drop_first = True)
train_dummy_columns = data_dummies.columns
data = pd.concat([data, data_dummies], axis = 1)

# Validation sample: encode every category, then keep exactly the training indicator columns.
# Using drop_first independently here could drop a different category, or yield a different
# set of columns, if a category is absent from the validation sample.
data_dummies = pd.get_dummies(data_validation[list_of_binary], dtype = int).reindex(columns = train_dummy_columns, fill_value = 0)
data_validation = pd.concat([data_validation, data_dummies], axis = 1)

In [19]:
# Remove the apostrophe-s from this indicator column name in both samples
nta_column_rename = {
    "has_Annadale-Huguenot-Prince's Bay-Eltingville_nta_name": "has_Annadale-Huguenot-Prince Bay-Eltingville_nta_name",
}

data = data.rename(columns = nta_column_rename)
data_validation = data_validation.rename(columns = nta_column_rename)

Отбираем релевантные поля и сохраняем подготовленные выборки в репозиторий:

In [26]:
# Final feature set, assembled group by group; 'health' (the target) comes last.
numeric_features = ['haversine_manhattan_distance_to_poor_tree',
                    'tree_dbh_wrangled', 'problems_count', 'has_problem']

problem_flags = ['has_' + name + '_problem'
                 for name in ('BranchLights', 'TrunkOther', 'BranchOther',
                              'RootOther', 'WiresRope', 'MetalGrates')]

category_flags = ['has_1or2_steward', 'has_3or4_steward',
                  'has_Helpful_guards', 'has_Harmful_guards',
                  'has_TreesCount Staff_user_type']

borough_flags = ['has_' + name + '_borough'
                 for name in ('Queens', 'Brooklyn', 'Staten Island', 'Bronx')]

nta_flags = ['has_' + name + '_nta_name'
             for name in ('Annadale-Huguenot-Prince Bay-Eltingville', 'Great Kills',
                          'East New York', 'Ridgewood', 'Bayside-Bayside Hills')]

dummy_features = ['curb_loc_OnCurb', 'sidewalk_NoDamage', 'trnk_light_Yes', 'brch_shoe_Yes']

# (column prefix, first component, last component); 'problems' starts at component 2
pca_component_ranges = [('spc_common', 1, 10), ('problems', 2, 5), ('address', 1, 5),
                        ('zip_city', 1, 5), ('nta_name', 1, 5)]
pca_features = [prefix + '_pca_component_' + str(k)
                for prefix, first, last in pca_component_ranges
                for k in range(first, last + 1)]

list_to_select = (numeric_features + problem_flags + category_flags + borough_flags
                  + nta_flags + dummy_features + pca_features + ['health'])

In [27]:
# Save the prepared samples next to each other in the relative data\ folder.
# The first path used to start with a backslash ("\\data\\..."), which points at the
# root of the current drive instead of the folder used for the validation file.
data[list_to_select].to_csv("data\\data_balanced_wrangled.csv", index = False)
data_validation[list_to_select].to_csv("data\\data_validation_wrangled.csv", index = False)