# Урок 9. Кейс 2. Внедрение модели в продукцию
Курсовой проект
Часть 1 Создание и тренировка модели

In [1]:
import pandas as pd
import dill
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix,accuracy_score,classification_report
from sklearn.metrics import roc_auc_score,roc_curve,scorer
from sklearn.metrics import f1_score
#working with text
from sklearn.feature_extraction.text import TfidfVectorizer
#normalizing data
from sklearn.preprocessing import StandardScaler
#pipeline
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.metrics import precision_score,recall_score
#imputer
from sklearn.impute import SimpleImputer

import sklearn.datasets



In [2]:
#Read the data From Dataset
fake_news = pd.read_csv("data/data.csv")

In [15]:
fake_news.head(10)

Unnamed: 0,Headline,Body,Label
0,Four ways Bob Corker skewered Donald Trump,Image copyright Getty Images\nOn Sunday mornin...,1
1,Linklater's war veteran comedy speaks to moder...,"LONDON (Reuters) - “Last Flag Flying”, a comed...",1
2,Trump’s Fight With Corker Jeopardizes His Legi...,The feud broke into public view last week when...,1
3,Egypt's Cheiron wins tie-up with Pemex for Mex...,MEXICO CITY (Reuters) - Egypt’s Cheiron Holdin...,1
4,Jason Aldean opens 'SNL' with Vegas tribute,"Country singer Jason Aldean, who was performin...",1
5,JetNation FanDuel League; Week 4,JetNation FanDuel League; Week 4\n% of readers...,0
6,Kansas Tried a Tax Plan Similar to Trump’s. It...,"In 2012, Kansas lawmakers, led by Gov. Sam Bro...",1
7,"India RBI chief: growth important, but not at ...",The Reserve Bank of India (RBI) Governor Urjit...,1
8,EPA chief to sign rule on Clean Power Plan exi...,"Scott Pruitt, Administrator of the U.S. Enviro...",1
9,Talks on sale of Air Berlin planes to easyJet ...,FILE PHOTO - An Air Berlin sign is seen at an ...,1


In [4]:
fake_news=  fake_news.drop(['URLs'], axis=1)
fake_news = fake_news.dropna()


In [5]:
fake_news.head()

Unnamed: 0,Headline,Body,Label
0,Four ways Bob Corker skewered Donald Trump,Image copyright Getty Images\nOn Sunday mornin...,1
1,Linklater's war veteran comedy speaks to moder...,"LONDON (Reuters) - “Last Flag Flying”, a comed...",1
2,Trump’s Fight With Corker Jeopardizes His Legi...,The feud broke into public view last week when...,1
3,Egypt's Cheiron wins tie-up with Pemex for Mex...,MEXICO CITY (Reuters) - Egypt’s Cheiron Holdin...,1
4,Jason Aldean opens 'SNL' with Vegas tribute,"Country singer Jason Aldean, who was performin...",1


In [6]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(fake_news[['Headline','Body']], 
                                                    fake_news['Label'], test_size=0.33, random_state=42)
#save test
X_test.to_csv("X_test.csv", index=None)
y_test.to_csv("y_test.csv", index=None)
#save train
X_train.to_csv("X_train.csv", index=None)
y_train.to_csv("y_train.csv", index=None)

In [7]:
class ColumnSelector(BaseEstimator, TransformerMixin):
    """
    Transformer to select a single column from the data frame to perform additional transformations on
    """
    def __init__(self, key):
        self.key = key

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        return X[self.key]
    
class TextImputer(BaseEstimator, TransformerMixin):
    def __init__(self, key, value):
        self.key = key
        self.value = value
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        X[self.key] = X[self.key].fillna(self.value)
        return X

In [8]:
#combine
Headline = Pipeline([
                ('imputer', TextImputer('Headline', '')),
                ('selector', ColumnSelector(key='Headline')),
                ('tfidf', TfidfVectorizer(max_df=0.9, min_df=10))
            ])
Body = Pipeline([
                ('imputer', TextImputer('Body', '')),
                ('selector', ColumnSelector(key='Body')),
                ('tfidf', TfidfVectorizer(max_df=0.9, min_df=10))
            ])


feats = FeatureUnion([('Headline', Headline),
                      ('Body', Body)])

In [9]:
%%time

from sklearn.ensemble import RandomForestClassifier

pipeline = Pipeline([
    ('features',feats),
    ('classifier', RandomForestClassifier( random_state=42)),
])

pipeline.fit(X_train, y_train)

y_pred = pipeline.predict(X_test)

Wall time: 2.55 s


In [10]:
from sklearn.metrics import roc_auc_score,roc_curve,scorer

In [11]:
y_pred

array([0, 1, 1, ..., 1, 1, 0], dtype=int64)

In [12]:
roc_auc_score(y_score=y_pred, y_true=y_test)

0.9788668876116962

In [13]:
pipeline.steps

[('features', FeatureUnion(transformer_list=[('Headline',
                                  Pipeline(steps=[('imputer',
                                                   TextImputer(key='Headline',
                                                               value='')),
                                                  ('selector',
                                                   ColumnSelector(key='Headline')),
                                                  ('tfidf',
                                                   TfidfVectorizer(max_df=0.9,
                                                                   min_df=10))])),
                                 ('Body',
                                  Pipeline(steps=[('imputer',
                                                   TextImputer(key='Body',
                                                               value='')),
                                                  ('selector',
                                        

In [14]:
with open("fake_news.dill", "wb") as f:
    dill.dump(pipeline, f)