In [221]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.feature_extraction import DictVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegressionCV
from sklearn.preprocessing import StandardScaler, Imputer
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import PolynomialFeatures
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.pipeline import make_pipeline, FeatureUnion
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin
from category_encoders import OneHotEncoder
from math import sqrt

%matplotlib inline

<h2>Загрузка данных</h2>

In [222]:
df_train = pd.read_csv('data/train_knn.csv')
df_test = pd.read_csv('data/test_knn.csv')

Отобранные фичи

In [223]:
some_features = ['Cash','Downloads','startYear','numVotes','runtimeMinutes','worldPromotion']
Y = df_train['rating']

<h1>Пробую писать свой GenreEncoder чтобы засунуть в Pipeline</h1>

In [224]:
class Selector(BaseEstimator, TransformerMixin):
    def __init__(self, keys = ['titleType']):
        self._ = keys
        
    def fit(self, df, y=None):
        return self
    
    def transform(self, df, y=None):
        return df[self._]
    
class GenreEncoder(BaseEstimator, TransformerMixin):
    def __init__(self, namecolumn):
        self.genres = []
        self._namecolumn = namecolumn
        
    def fit(self, df, y=None):
        samples = df[self._namecolumn]
        for row in samples:
            row = row.replace('nan', '"nan"')
            row = row.replace('None', '"None"')

            for genre in eval(row):
                if genre not in self.genres:
                    self.genres.append(genre)
        return self
    
    def transform(self, df, y=None):
        genre_rows = []
        samples = df[self._namecolumn]
        res_df = pd.DataFrame(columns=self.genres)
        
        for row in samples:
            temp = [0] * len(self.genres)
            
            row = row.replace('nan', '"nan"')
            row = row.replace('None', '"None"')
            
            for genre in eval(row):
                temp[self.genres.index(genre)] = 1
            
            genre_rows.append(temp)
            res = pd.DataFrame(genre_rows, columns = self.genres)
            res.drop(columns=['nan'], inplace=True)
        return res
    
#class TargetEncoder(BaseEstimator, TransformerMixin):

estKNN = KNeighborsClassifier(n_neighbors=10)
standardScaler = StandardScaler()
genreEncoder = GenreEncoder('genres')
typeOneHot = Pipeline([('select_types', Selector()), ('onehot',OneHotEncoder())])

<h2> Выделение в один Pipeline </h2>

In [235]:
categoryPipe = FeatureUnion([('typeOneHot', typeOneHot), ('genreEncoder', genreEncoder)])
numberPipe = Pipeline([('numFeatureSelection', Selector(some_features)), ('fillNa', Imputer(strategy='mean')), ('standardization', standardScaler)])
preprocessedData = FeatureUnion([('catP', categoryPipe), ('numPipe', numberPipe)])

workFlowKNN = Pipeline([('featuresHandle', preprocessedData), ('estimatorKNN', estKNN)])


<h3>Вспомогательная функция для predict(a)</h3>

In [236]:
def getPrediction(df_train, df_test, target_train, estimator):
    estimator.fit(df_train, target_train)
    return pd.DataFrame({'Id':df_test['Id'],'class': estimator.predict(df_test)})

In [232]:
sub = getPrediction(df_train, df_test, Y, workFlowLR)
#sub.to_csv('second_with_cosine_metric.csv', index=False)

# L1 REGULARIZATION

In [237]:
df_train_preprocessed = preprocessedData.fit_transform(df_train)

In [245]:
df_train_preprocessed = pd.DataFrame(df_train_preprocessed)
print(df_train_preprocessed.head())
print(Y)

    0    1    2    3    4    5    6    7    8    9     ...      36   37   38  \
0  1.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0    ...     0.0  0.0  0.0   
1  0.0  1.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0    ...     0.0  0.0  0.0   
2  1.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0    ...     0.0  0.0  0.0   
3  0.0  0.0  1.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0    ...     0.0  0.0  0.0   
4  0.0  1.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0    ...     0.0  0.0  0.0   

    39        40        41        42        43        44        45  
0  0.0  0.166018  1.097697  0.570449 -0.074103 -0.953798  1.013086  
1  0.0 -0.528980 -0.451967  0.570449  1.161443  1.432522 -0.987083  
2  0.0 -0.209273  1.061106  0.617784 -0.076654 -1.382799  1.013086  
3  0.0  1.352907  0.093152  0.144441 -0.076559  0.000000  1.013086  
4  0.0 -1.419230  0.163520 -0.518240 -0.076182  0.000000 -0.987083  

[5 rows x 46 columns]
0       6
1       8
2       9
3       6
4       7
5       4
6       9
7       8
8 

In [262]:
#l1_cv = LogisticRegressionCV(penalty='l1', cv=2, solver='liblinear', scoring='accuracy').fit(df_train_preprocessed, Y)
print([list(Y).count(i) for i in range(1,11)])

[1, 9, 29, 74, 155, 393, 534, 627, 164, 14]


# DEBUG

In [193]:
tempUnion.fit_transform(df_train)
#genreEncoder.fit_transform(df_train)
#print(df_train.head())
#one = LabelBinarizer()
#one.fit(Selector().fit_transform(df_train))
#print(one.fit_transform(Selector().fit_transform(df_train)))
#typeOneHot.fit(df_train)
#print(1)

array([[ 1.        ,  0.        ,  0.        , ..., -0.07410313,
        -0.90328413,  1.01308561],
       [ 0.        ,  1.        ,  0.        , ...,  1.1614434 ,
         1.47290112, -0.98708341],
       [ 1.        ,  0.        ,  0.        , ..., -0.07665396,
        -1.3304635 ,  1.01308561],
       ...,
       [ 0.        ,  0.        ,  0.        , ..., -0.07618158,
        -0.3426112 ,  1.01308561],
       [ 0.        ,  0.        ,  0.        , ..., -0.06739539,
        -0.23581636, -0.98708341],
       [ 0.        ,  1.        ,  0.        , ...,  0.01678201,
         1.23261272,  1.01308561]])

In [132]:
e = 0
f = 0
n = int(input())
for i in range(n):
    a = int(input())
    f = max(e,a)
    e = f
print(e)

3
88
22
2
88
