# Practice

Мы поработаем с данными о сообществах в США. Описание датасета:

http://archive.ics.uci.edu/ml/datasets/communities+and+crime

Датасет на кэггле (в формате .csv):

https://www.kaggle.com/kkanda/communities%20and%20crime%20unnormalized%20data%20set

Будем предсказывать количество насильственных преступлений относительно численности населения.

In [None]:
import warnings

import pandas as pd
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import MinMaxScaler

# Suppress every warning for the rest of the session; note that this also
# hides warnings raised by libraries, not only by this notebook's own code.
warnings.filterwarnings('ignore')

In [None]:
from zipfile import ZipFile

# Open the downloaded archive; the handle is left open because later cells
# read members through `z`.
z = ZipFile("archive.zip")

# One ZipInfo record per archive member.
text_files = z.infolist()

# Decompress every member once. The returned bytes are discarded, so this
# only confirms that each member can be read.
for member in text_files:
    z.read(member.filename)

In [None]:
# Show the ZipInfo record of the archive's second member (index 1).
text_files[1]

<ZipInfo filename='crimedata.csv' compress_type=deflate file_size=1555523 compress_size=665855>

In [None]:
# File name of the archive's second member.
text_files[1].filename

'crimedata.csv'

In [None]:
# Extract the CSV member to disk so pandas can read it by path.
with open("test.csv", 'wb') as f:
    f.write(z.read(text_files[1].filename))  # ZipFile.read returns bytes

# The encoding must be passed by keyword: the second positional parameter of
# read_csv is `sep`, so the previous call used 'utf-8' as the delimiter
# (and recent pandas versions reject extra positional arguments outright).
# NOTE: this read is still expected to raise UnicodeDecodeError, because the
# file is not valid UTF-8; the next cells detect the actual encoding.
df = pd.read_csv('test.csv', encoding='utf-8')

UnicodeDecodeError: ignored

In [None]:
from chardet.universaldetector import UniversalDetector # import the chardet.universaldetector submodule
# Detector that is fed raw bytes incrementally.
detector = UniversalDetector()
# Read the file in binary mode so no decoding is attempted before detection.
with open('test.csv', 'rb') as fh:
    for line in fh:
        detector.feed(line)
        # Stop reading as soon as the detector reports that it is done.
        if detector.done:
            break
# Finalise detection; as the last expression of the cell, its return value
# is what the notebook displays (presumably the result dict -- confirm in chardet docs).
detector.close()

{'encoding': 'ISO-8859-1', 'confidence': 0.73, 'language': ''}

In [None]:
# Load the extracted CSV with the detected encoding; "?" cells become NaN.
data = pd.read_csv("test.csv", encoding='ISO-8859-1', na_values=["?"])

# Keep only the needed columns, addressed by position.
requiredColumns = [5, 6, *range(11, 26), *range(32, 103), 145]
data = data.iloc[:, requiredColumns]

# Some values of the target variable are missing: keep only rows that have it,
# then separate the features from the target.
has_target = data['ViolentCrimesPerPop'].notnull()
X = data[has_target].drop(columns='ViolentCrimesPerPop')
y = data.loc[X.index, 'ViolentCrimesPerPop']

# Reproducible train/test split (default proportions).
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

In [None]:
# Preview the first two rows of the selected columns.
data.head(2)

Unnamed: 0,population,householdsize,agePct12t21,agePct12t29,agePct16t24,agePct65up,numbUrban,pctUrban,medIncome,pctWWage,...,MedOwnCostPctInc,MedOwnCostPctIncNoMtg,NumInShelters,NumStreet,PctForeignBorn,PctBornSameState,PctSameHouse85,PctSameCity85,PctSameState85,ViolentCrimesPerPop
0,11980,3.1,12.47,21.44,10.93,11.33,11980,100.0,75122,89.24,...,21.1,14.0,11,0,10.66,53.72,65.29,78.09,89.14,41.02
1,23123,2.82,11.01,21.3,10.48,17.18,23123,100.0,47917,78.99,...,20.7,12.5,0,0,8.3,77.17,71.27,90.22,96.12,127.56


### 1 Baseline

Обучим линейную регрессию и выведем качество по метрике MSE на обучающей и тестовой выборке.

In [None]:
# Fit an ordinary least-squares baseline and report MSE on both splits.
lr = LinearRegression()
lr.fit(X_train, y_train)
print(f"Train: {mean_squared_error(y_train, lr.predict(X_train))}")
print(f"Test: {mean_squared_error(y_test, lr.predict(X_test))}")

# Exercise: print the train and test quality below using the MSE() function
# you wrote earlier, and compare the results.
# YOUR CODE HERE

Train: 119935.90613769476
Test: 206978.88436767244


In [None]:
from sklearn.metrics import r2_score

# Report MSE first, then R^2, each on the train and the test split.
for metric in (mean_squared_error, r2_score):
    print(f"Train: {metric(y_train, lr.predict(X_train))}")
    print(f"Test: {metric(y_test, lr.predict(X_test))}")


Train: 0.6731208442568581
Test: 0.49554452282478767


Популярным решением для регрессионных моделей является **регуляризация**.

Во время оптимизации линейной регрессии веса при переменных могут получиться большими по абсолютной величине. Это не очень хорошо, поскольку модель будет чувствительна к крайне маленьким изменениям в признаках объекта, а значит, переобучена.

Для решения проблемы к функционалу ошибки добавляют так называемый регуляризатор, который "штрафует" модель за слишком большую норму вектора весов:


$$Q_\alpha(w) = Q(w) + \alpha R(w)$$

где $R(w)$ — регуляризатор, а $\alpha \ge 0$ — коэффициент регуляризации

Наиболее распространенными являются L1 и L2 регуляризаторы
$$L1: R(w) = ||w||_1 = \sum^d_{i=1} |w_i|$$

$$L2: R(w) = ||w||_2^2 = \sum^d_{i=1} w_i^2$$

Давайте применим каждый из них к нашей задаче и посмотрим на изменение в результатах.


В качестве метода регуляризации используем Ridge ($L_2$-регуляризация).

In [None]:
# Ridge regression on the raw features with regularisation strength alpha=1.0.
ridge = Ridge(alpha=1.0)
ridge.fit(X_train, y_train)
# MSE first, then R^2, each on train and test.
for metric in (mean_squared_error, r2_score):
    print(f"Train: {metric(y_train, ridge.predict(X_train))}")
    print(f"Test: {metric(y_test, ridge.predict(X_test))}")

Train: 120034.34899044895
Test: 206397.58450110914
Train: 0.6728525433148509
Test: 0.496961285227701


In [None]:
# Same experiment with a weaker penalty, alpha=0.5.
ridge = Ridge(alpha=0.5)
ridge.fit(X_train, y_train)
# MSE first, then R^2, each on train and test.
for metric in (mean_squared_error, r2_score):
    print(f"Train: {metric(y_train, ridge.predict(X_train))}")
    print(f"Test: {metric(y_test, ridge.predict(X_test))}")

Train: 119980.70658670652
Test: 206409.66591840147
Train: 0.6729987429327304
Test: 0.4969318400157231


In [None]:
# Same experiment with an even weaker penalty, alpha=0.1.
ridge = Ridge(alpha=0.1)
ridge.fit(X_train, y_train)
# MSE first, then R^2, each on train and test.
for metric in (mean_squared_error, r2_score):
    print(f"Train: {metric(y_train, ridge.predict(X_train))}")
    print(f"Test: {metric(y_test, ridge.predict(X_test))}")

Train: 119940.59846478517
Test: 206721.20377398893
Train: 0.6731080555602353
Test: 0.4961725501099985


### 2 Scaling

Попробуем MinMaxScaler.

In [None]:
# Min-max scale the features; the scaler is fitted on the training split only
# and then applied unchanged to the test split.
sc = MinMaxScaler()
# The transformer returns a bare array, so the row labels must be restored
# explicitly. Without `index=` the frames get a fresh 0..n-1 index that no
# longer matches y_train / y_test, and any pandas-side operation combining
# features with targets would silently misalign rows.
X_train_scaled = pd.DataFrame(
    data=sc.fit_transform(X_train),
    columns=X_train.columns,
    index=X_train.index,
)
X_test_scaled = pd.DataFrame(
    data=sc.transform(X_test),
    columns=X_test.columns,
    index=X_test.index,
)

**Задание:** Напишите код обучения линейной регрессии на масштабированных признаках и выведите ошибку на обучающей и тестовой выборке

In [None]:
# YOUR CODE HERE
# Least-squares fit on the min-max scaled features.
lr = LinearRegression()
lr.fit(X_train_scaled, y_train)
# MSE first, then R^2, each on train and test.
for metric in (mean_squared_error, r2_score):
    print(f"Train: {metric(y_train, lr.predict(X_train_scaled))}")
    print(f"Test: {metric(y_test, lr.predict(X_test_scaled))}")

Train: 119935.90613769476
Test: 206978.88437126437
Train: 0.6731208442568581
Test: 0.49554452281603323


**Задание:** проделайте аналогичную работу, добавив Ridge регуляризацию

In [None]:
# YOUR CODE HERE
# Ridge (alpha=0.5) on the min-max scaled features.
ridge = Ridge(alpha=0.5)
ridge.fit(X_train_scaled, y_train)
# MSE first, then R^2, each on train and test.
for metric in (mean_squared_error, r2_score):
    print(f"Train: {metric(y_train, ridge.predict(X_train_scaled))}")
    print(f"Test: {metric(y_test, ridge.predict(X_test_scaled))}")


Train: 122984.86952707608
Test: 183859.0431778662
Train: 0.66481105104557
Test: 0.5518929303217623


### 3. High/low variance

Полезны ли признаки, имеющие высокую дисперсию? А низкую?

In [None]:
# Variance of each scaled training feature, ordered from largest to smallest.
variance_by_feature = X_train_scaled.var()
features_variance = variance_by_feature.sort_values(ascending=False)
# Display the five highest-variance features.
features_variance.head()

pctUrban          0.197731
RentHighQ         0.063005
MedYrHousBuilt    0.054831
OwnOccHiQuart     0.048807
MedRent           0.046863
dtype: float64

Попробуем удалить признаки с самой низкой дисперсией и посмотреть, как изменится качество. В sklearn есть специальный инструмент для такого наивного отбора признаков. Стоит ли нормализовать перед этим признаки?

In [None]:
from sklearn.feature_selection import VarianceThreshold

In [None]:
# Remove every feature whose variance is below the given threshold.
vs_transformer = VarianceThreshold(threshold=0.01)

# Fit on the training split only, then apply the same column mask to the test split.
train_var_values = vs_transformer.fit_transform(X_train_scaled)
test_var_values = vs_transformer.transform(X_test_scaled)
kept_by_variance = vs_transformer.get_support()

# The transformer returns bare arrays: restore the column names of the kept
# features and the original row labels, so the frames stay aligned with
# y_train / y_test (omitting `index=` would silently reset it to 0..n-1).
X_train_var = pd.DataFrame(
    data=train_var_values,
    columns=X_train_scaled.columns[kept_by_variance],
    index=X_train.index,
)
X_test_var = pd.DataFrame(
    data=test_var_values,
    columns=X_test_scaled.columns[kept_by_variance],
    index=X_test.index,
)

X_train_var.shape

(1495, 76)

In [None]:
# Least-squares fit on the variance-filtered features.
lr = LinearRegression()
lr.fit(X_train_var, y_train)
# MSE first, then R^2, each on train and test.
for metric in (mean_squared_error, r2_score):
    print(f"Train: {metric(y_train, lr.predict(X_train_var))}")
    print(f"Test: {metric(y_test, lr.predict(X_test_var))}")


Train: 125706.38916046257
Test: 149123.25580684407
Train: 0.6573936889832163
Test: 0.636552088896503


In [None]:
# Ridge with a stronger penalty (alpha=5.0) on the variance-filtered features.
ridge = Ridge(alpha=5.0)
ridge.fit(X_train_var, y_train)
# MSE first, then R^2, each on train and test.
for metric in (mean_squared_error, r2_score):
    print(f"Train: {metric(y_train, ridge.predict(X_train_var))}")
    print(f"Test: {metric(y_test, ridge.predict(X_test_var))}")

Train: 136186.78301451617
Test: 152046.21566890882
Train: 0.6288299135035398
Test: 0.6294281587599264


### 4 Correlation

Можно выбрать k признаков, которые дают наиболее высокие значения корреляции с целевой переменной.

In [None]:
from sklearn.feature_selection import SelectKBest, f_regression

In [None]:
# Select the 15 best features according to the univariate f_regression score.
sb = SelectKBest(f_regression, k=15)

# Fit the selector on the training split only, then reuse its mask for the test split.
train_kbest_values = sb.fit_transform(X_train_var, y_train)
test_kbest_values = sb.transform(X_test_var)
kept_by_score = sb.get_support()

# Restore column names and the original row labels; without `index=` the
# frames would get a fresh 0..n-1 index that no longer matches y_train / y_test.
X_train_kbest = pd.DataFrame(
    data=train_kbest_values,
    columns=X_train_var.columns[kept_by_score],
    index=X_train.index,
)
X_test_kbest = pd.DataFrame(
    data=test_kbest_values,
    columns=X_test_var.columns[kept_by_score],
    index=X_test.index,
)

In [None]:
# Check the (rows, columns) shape after keeping the selected features.
X_train_kbest.shape

(1495, 15)

In [None]:
# Least-squares fit on the k-best features.
lr = LinearRegression()
lr.fit(X_train_kbest, y_train)
# MSE first, then R^2, each on train and test.
for metric in (mean_squared_error, r2_score):
    print(f"Train: {metric(y_train, lr.predict(X_train_kbest))}")
    print(f"Test: {metric(y_test, lr.predict(X_test_kbest))}")


Train: 147378.18578795987
Test: 156005.7803589241
Train: 0.5983283197108951
Test: 0.6197777825816513


In [None]:
# Ridge (alpha=0.5) on the k-best features.
ridge = Ridge(alpha=0.5)
ridge.fit(X_train_kbest, y_train)
# MSE first, then R^2, each on train and test.
for metric in (mean_squared_error, r2_score):
    print(f"Train: {metric(y_train, ridge.predict(X_train_kbest))}")
    print(f"Test: {metric(y_test, ridge.predict(X_test_kbest))}")

Train: 150111.52271527727
Test: 154590.58194063304
Train: 0.5908787502205266
Test: 0.6232269488846662


А можно выбрать самые значимые признаки с точки зрения регрессии с $L_1$-регуляризацией.

In [None]:
from sklearn.feature_selection import SelectFromModel

In [None]:
# Fit a Lasso model and keep the features it considers important.
lasso = Lasso(alpha=5.0)
l1_select = SelectFromModel(lasso)

# Fit the selector on the training split only, then reuse its mask for the test split.
train_l1_values = l1_select.fit_transform(X_train_var, y_train)
test_l1_values = l1_select.transform(X_test_var)
kept_by_lasso = l1_select.get_support()

# Restore column names and the original row labels; without `index=` the
# frames would get a fresh 0..n-1 index that no longer matches y_train / y_test.
X_train_l1 = pd.DataFrame(
    data=train_l1_values,
    columns=X_train_var.columns[kept_by_lasso],
    index=X_train.index,
)
X_test_l1 = pd.DataFrame(
    data=test_l1_values,
    columns=X_test_var.columns[kept_by_lasso],
    index=X_test.index,
)

X_train_l1.shape

(1495, 12)

In [None]:
# Plain least squares on the Lasso-selected features.
lr = LinearRegression()
lr.fit(X_train_l1, y_train)
for metric in (mean_squared_error, r2_score):
    print(f"Train: {metric(y_train, lr.predict(X_train_l1))}")
    print(f"Test: {metric(y_test, lr.predict(X_test_l1))}")

# Visual separator between the two models' reports.
print("___")

# Ridge (alpha=0.5) on the same features, for comparison.
ridge = Ridge(alpha=0.5)
ridge.fit(X_train_l1, y_train)
for metric in (mean_squared_error, r2_score):
    print(f"Train: {metric(y_train, ridge.predict(X_train_l1))}")
    print(f"Test: {metric(y_test, ridge.predict(X_test_l1))}")


Train: 140757.45879349473
Test: 153086.92726760302
Train: 0.6163727712854883
Test: 0.626891703566804
___
Train: 140877.1845197516
Test: 153217.0887619086
Train: 0.6160464649642217
Test: 0.6265744698599921


### 5 Pipeline

А можно сделать все вышеописанное сразу:

In [None]:
from sklearn.pipeline import Pipeline

# The same stages as in the manual cells, chained in order: scaling,
# variance filter, Lasso-based selection, and the final Ridge regressor.
pipeline_steps = [
    ('scaler', MinMaxScaler()),
    ('variance', VarianceThreshold(threshold=0.01)),
    ('selection', SelectFromModel(estimator=Lasso(alpha=5.0))),
    ('regressor', Ridge(alpha=0.5)),
]
pipe = Pipeline(steps=pipeline_steps)

# Fitting the pipeline fits every stage on the raw training features.
pipe.fit(X_train, y_train)

# Show the fitted stages by name.
pipe.named_steps

{'scaler': MinMaxScaler(),
 'variance': VarianceThreshold(threshold=0.01),
 'selection': SelectFromModel(estimator=Lasso(alpha=5.0)),
 'regressor': Ridge(alpha=0.5)}

In [None]:
# Evaluate the whole pipeline on raw features: MSE first, then R^2.
for metric in (mean_squared_error, r2_score):
    print(f"Train: {metric(y_train, pipe.predict(X_train))}")
    print(f"Test: {metric(y_test, pipe.predict(X_test))}")

Train: 140877.1845197516
Test: 153217.0887619086
Train: 0.6160464649642217
Test: 0.6265744698599921


Можно также настраивать параметры с помощью `GridSearchCV`:

In [None]:
# List every parameter of the pipeline; parameters of nested steps are
# addressed as <step name>__<parameter>, which is the naming the grid below uses.
pipe.get_params()

{'memory': None,
 'steps': [('scaler', MinMaxScaler()),
  ('variance', VarianceThreshold(threshold=0.01)),
  ('selection', SelectFromModel(estimator=Lasso(alpha=5.0))),
  ('regressor', Ridge(alpha=0.5))],
 'verbose': False,
 'scaler': MinMaxScaler(),
 'variance': VarianceThreshold(threshold=0.01),
 'selection': SelectFromModel(estimator=Lasso(alpha=5.0)),
 'regressor': Ridge(alpha=0.5),
 'scaler__clip': False,
 'scaler__copy': True,
 'scaler__feature_range': (0, 1),
 'variance__threshold': 0.01,
 'selection__estimator__alpha': 5.0,
 'selection__estimator__copy_X': True,
 'selection__estimator__fit_intercept': True,
 'selection__estimator__max_iter': 1000,
 'selection__estimator__positive': False,
 'selection__estimator__precompute': False,
 'selection__estimator__random_state': None,
 'selection__estimator__selection': 'cyclic',
 'selection__estimator__tol': 0.0001,
 'selection__estimator__warm_start': False,
 'selection__estimator': Lasso(alpha=5.0),
 'selection__importance_getter': '

In [None]:
# Candidate values for each tuned parameter, keyed as <step name>__<parameter>.
param_grid = dict(
    variance__threshold=[0.005, 0.0075, 0.009, 0.01, 0.011, 0.012],
    selection__estimator__alpha=[0.1, 0.5, 1.0, 1.5, 2.0, 5.0, 10.0],
    regressor__alpha=[0.1, 0.5, 1.0, 1.5, 2.0, 5.0, 10.0],
)
# Exhaustive search over the grid with 5-fold cross-validation.
grid_search = GridSearchCV(estimator=pipe, param_grid=param_grid, cv=5)

grid_search.fit(X_train, y_train)

In [None]:
# Take the best pipeline found by the grid search and show its steps.
pipe_best = grid_search.best_estimator_
pipe_best.named_steps

{'scaler': MinMaxScaler(),
 'variance': VarianceThreshold(threshold=0.01),
 'selection': SelectFromModel(estimator=Lasso(alpha=0.1)),
 'regressor': Ridge(alpha=0.5)}

In [None]:
# GridSearchCV was created without a `refit` argument, so by default it has
# already refitted best_estimator_ on the full training data passed to
# grid_search.fit; the extra pipe_best.fit(X_train, y_train) call repeated
# that work and is dropped.
train_pred = pipe_best.predict(X_train)
test_pred = pipe_best.predict(X_test)

# MSE first, then R^2, each on train and test.
print(f"Train: {mean_squared_error(y_train, train_pred)}")
print(f"Test: {mean_squared_error(y_test, test_pred)}")
print(f"Train: {r2_score(y_train, train_pred)}")
print(f"Test: {r2_score(y_test, test_pred)}")

Train: 128441.16453337156
Test: 147186.92249142198
Train: 0.6499401990832248
Test: 0.6412713816378165
