In [115]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import nltk
from scipy import stats
from sklearn import datasets
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split 
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.svm import LinearSVC
from sklearn.ensemble import VotingClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectKBest
from sklearn.model_selection import GridSearchCV
from sklearn.impute import KNNImputer
from sklearn.calibration import CalibratedClassifierCV
from nltk.classify.scikitlearn import SklearnClassifier
from sklearn.feature_extraction.text import CountVectorizer
import xgboost

In [116]:
X_train = pd.read_csv("data/train.csv")
X_test = pd.read_csv('data/test.csv')

In [117]:
vectorizer = CountVectorizer()

In [118]:
X_train.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,8,,,#RockyFire Update => California Hwy. 20 closed...,1
4,13,,,I'm on top of the hill and I can see a fire in...,1


In [119]:
X_train[X_train['target']==1]['text']

0       Our Deeds are the Reason of this #earthquake M...
1                  Forest fire near La Ronge Sask. Canada
2       All residents asked to 'shelter in place' are ...
3       #RockyFire Update => California Hwy. 20 closed...
4       I'm on top of the hill and I can see a fire in...
                              ...                        
5341    Suicide bomber kills 15 in Saudi security site...
5342    Two giant cranes holding a bridge collapse int...
5343    @aria_ahrary @TheTawniest The out of control w...
5344    Police investigating after an e-bike collided ...
5345    The Latest: More Homes Razed by Northern Calif...
Name: text, Length: 2291, dtype: object

In [120]:
X_train[X_train['target']==0]['text']

8                                          What's up man?
9                                           I love fruits
10                                       Summer is lovely
11                                      My car is so fast
12                                 this is ridiculous....
                              ...                        
5325    @widda16 ... He's gone. You can relax. I thoug...
5326     @jt_ruff23 @cameronhacker and I wrecked you both
5327    Three days off from work and they've pretty mu...
5328    @engineshed Great atmosphere at the British Li...
5329    Cramer: Iger's 3 words that wrecked Disney's s...
Name: text, Length: 3055, dtype: object

In [121]:
print('X_train.shape(sin procesar): ', X_train.shape)
print('X_test.shape(sin procesar): ', X_test.shape)

X_train.shape(sin procesar):  (5346, 5)
X_test.shape(sin procesar):  (2267, 4)


In [122]:
# Eliminamos los duplicados
X_train = X_train.drop_duplicates(subset='text')
print('X_train.shape(sin duplicados): ', X_train.shape)

X_train.shape(sin duplicados):  (5286, 5)


In [123]:
import re

signos = re.compile("(\.)|(\;)|(\:)|(\!)|(\?)|(\,)|(\")|(\()|(\))|(\[)|(\])|(\d+)|(\>)|(\=)|(\<)")
signos_arroba = re.compile("(\.)|(\;)|(\:)|(\!)|(\?)|(\,)|(\")|(\()|(\))|(\[)|(\])|(\d+)|(\>)|(\=)|(\<)|(\@)|(\#)")

def signs_tweets(tweet):
    return signos_arroba.sub('', tweet.lower())

X_train['text'] = X_train['text'].apply(signs_tweets)
X_train['text'].head()
X_test['text'] = X_test['text'].apply(signs_tweets)
X_test['text'].head()

0     people receive wildfires evacuation orders in...
1    just got sent this photo from ruby alaska as s...
2    flood disaster heavy rain causes flash floodin...
3    there's an emergency evacuation happening now ...
4    i'm afraid that the tornado is coming to our area
Name: text, dtype: object

In [124]:
X_train['text'] = X_train['text'].str.strip()
X_test['text'] = X_test['text'].str.strip()
print(X_train['text'].head())
print(X_test['text'].head())

0    our deeds are the reason of this earthquake ma...
1                forest fire near la ronge sask canada
2    all residents asked to 'shelter in place' are ...
3    rockyfire update  california hwy  closed in bo...
4    i'm on top of the hill and i can see a fire in...
Name: text, dtype: object
0    people receive wildfires evacuation orders in ...
1    just got sent this photo from ruby alaska as s...
2    flood disaster heavy rain causes flash floodin...
3    there's an emergency evacuation happening now ...
4    i'm afraid that the tornado is coming to our area
Name: text, dtype: object


In [125]:
def remove_links(df):
    return " ".join(['{link}' if ('http') in word else word for word in df.split()])

X_train['text'] = X_train['text'].apply(remove_links)
X_test['text'] = X_test['text'].apply(remove_links)

In [126]:
from nltk.corpus import stopwords

english_stopwords = stopwords.words('english')

def remove_stopwords(df):
    return " ".join([word for word in df.split() if word not in english_stopwords])

X_train['text'] = X_train['text'].apply(remove_stopwords)
X_train.head()
X_test['text'] = X_test['text'].apply(remove_stopwords)
X_test.head()

Unnamed: 0,id,keyword,location,text
0,6,,,people receive wildfires evacuation orders cal...
1,7,,,got sent photo ruby alaska smoke wildfires pou...
2,10,,,flood disaster heavy rain causes flash floodin...
3,14,,,there's emergency evacuation happening buildin...
4,15,,,i'm afraid tornado coming area


In [127]:
from nltk.stem.snowball import SnowballStemmer

def english_stemmer(x):
    stemmer = SnowballStemmer('english')
    return ' '.join([stemmer.stem(word) for word in x.split()])

X_train['text'] = X_train['text'].apply(english_stemmer)
X_test['text'] = X_test['text'].apply(english_stemmer)

In [128]:
'''nltk.download('wordnet')
def get_lemmatized_text(corpus):
    cadena = '' 
    from nltk.stem import WordNetLemmatizer
    lemmatizer = WordNetLemmatizer()
    for word in corpus.split():
        print(word)
        cadena = cadena + ' ' + lemmatizer.lemmatize(word)
    return cadena.strip()# Lematizamos las reviews
X_train['text'] = X_train['text'].apply(get_lemmatized_text)
X_test['text'] = X_test['text'].apply(get_lemmatized_text)'''

"nltk.download('wordnet')\ndef get_lemmatized_text(corpus):\n    cadena = '' \n    from nltk.stem import WordNetLemmatizer\n    lemmatizer = WordNetLemmatizer()\n    for word in corpus.split():\n        print(word)\n        cadena = cadena + ' ' + lemmatizer.lemmatize(word)\n    return cadena.strip()# Lematizamos las reviews\nX_train['text'] = X_train['text'].apply(get_lemmatized_text)\nX_test['text'] = X_test['text'].apply(get_lemmatized_text)"

In [129]:
X_train = X_train[['text', 'target']]

In [130]:
# Si solo es el modelo, no hará falta meterlo en un pipeline
rand_forest = Pipeline([
    ('vect', vectorizer),
    ('rand',RandomForestClassifier())
])

svm = Pipeline([
    ('vect', vectorizer),
    ("selectkbest",SelectKBest()),
    ("svm",SVC(probability=True))
])

reg_log = Pipeline([
    ('vect', vectorizer),
    ("imputer",SimpleImputer()),
    ("reglog",LogisticRegression())
])

gbc = Pipeline([
    ('vect', vectorizer),
    ('gbc',GradientBoostingClassifier())
])

xgboost = Pipeline([
    ('vect', vectorizer),
    ('xgboost',xgboost.XGBClassifier())
])

'''
Para iterar hiperparámetros de varios elementos del pipeline, le ponemos un nombre
a cada elemento en el pipeline, por ejemplo 'selectkbest' y 'svm', para luego en el
grid de hiperparametros identificar sus respectivos parametros mediante el nombre
que le hayamos puesto en el pipeline, dos guines bajos y el nombre del hiperparámetro.
'''

grid_random_forest = {
    'vect__max_df': (0.5, 1.9),
    'vect__min_df': (10, 20,50),
    'vect__max_features': (500, 1000),
    'vect__ngram_range': ((1, 1), (1, 2),(1,3)), # unigramas or bigramas
    "rand__n_estimators": [120],
    "rand__max_depth": [3,4,5,6,10,15,17],
    "rand__max_features": ["sqrt", 3, 4]
}

grid_gradient_boosting = {
    'vect__max_df': (0.5, 1.9),
    'vect__min_df': (10, 20,50),
    'vect__max_features': (500, 1000),
    'vect__ngram_range': ((1, 1), (1, 2),(1,3)), # unigramas or bigramas
    "gbc__loss": ["deviance"], # Deviance suele ir mejor.
    "gbc__learning_rate": [0.05, 0.1, 0.2, 0.4, 0.5], # Cuanto más alto, mas aporta cada nuevo arbol
    "gbc__n_estimators": [20,50,100,200], # Cuidado con poner muchos estiamdores ya que vamos a sobreajustar el modelo
    "gbc__max_depth": [1,2,3,4,5], # No es necesario poner una profundiad muy alta. Cada nuevo arbol va corrigiendo el error de los anteriores.
    "gbc__max_features": ["sqrt", 3, 4], # Igual que en el random forest
}

grid_xgboost = {
    'vect__max_df': (0.5, 1.9),
    'vect__min_df': (10, 20,50),
    'vect__max_features': (500, 1000),
    'vect__ngram_range': ((1, 1), (1, 2),(1,3)),# unigramas or bigramas
    "xgboost__learning_rate": [0.05, 0.1, 0.2, 0.4, 0.5],# Cuanto más alto, mas aporta cada nuevo arbol
    "xgboost__n_estimators": [20,50,100,200], # Cuidado con poner muchos estiamdores ya que vamos a
                                                # sobreajustar el modelo
    "xgboost__max_depth": [1,2,3,4,5] # No es necesario poner una profundiad muy alta. Cada nuevo
                                        # arbol va corrigiendo el error de los anteriores.
}

grid_xgboost2 = {
    'vect__max_df': (0.5, 1.9),
    'vect__min_df': (10, 20,50),
    'vect__max_features': (500, 1000),
    'vect__ngram_range': ((1, 1), (1, 2)),# unigramas or bigramas
    "xgboost__learning_rate": [0.2, 0.4, 0.5],# Cuanto más alto, mas aporta cada nuevo arbol
    "xgboost__n_estimators": [100,200], # Cuidado con poner muchos estiamdores ya que vamos a
                                                # sobreajustar el modelo
    "xgboost__max_depth": [3,4,5] # No es necesario poner una profundiad muy alta. Cada nuevo
                                        # arbol va corrigiendo el error de los anteriores.
}

svm_param = {
    'vect__max_df': (0.5, 1.9),
    'vect__min_df': (10, 20,50),
    'vect__max_features': (500, 1000),
    'vect__ngram_range': ((1, 1), (1, 2),(1,3)),# unigramas or bigramas
    'selectkbest__k': [1,2,3],
    'svm__C': [0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9],
    'svm__kernel': ["linear","poly","rbf"],
    'svm__coef0': [-10.,-1., 0., 0.1, 0.5, 1, 10, 100],
    'svm__gamma': ('scale', 'auto')
}

svm_param2 = {
    'vect__max_df': (0.5, 1.9),
    'vect__min_df': (10, 20,50),
    'vect__max_features': (500, 1000),
    'vect__ngram_range': ((1, 1), (1, 2)),# unigramas or bigramas
    'selectkbest__k': [1,2,3],
    'svm__C': [0.5, 0.6, 0.7, 0.8, 0.9],
    'svm__kernel': ["linear","poly","rbf"],
    'svm__coef0': [0., 0.1, 0.5, 1, 10, 100],
    'svm__gamma': ('scale', 'auto')
}

reg_log_param = {
    'vect__max_df': (0.5, 1.9),
    'vect__min_df': (10, 20,50),
    'vect__max_features': (500, 1000),
    'vect__ngram_range': ((1, 1), (1, 2),(1,3)),# unigramas or bigramas
    "imputer__strategy": ['mean', 'median', 'most_frequent'],
    "reglog__penalty": ["l1","l2"],
    "reglog__C": np.logspace(0, 4, 10)
}

reg_log_param2 = {
    'vect__max_df': (1.9, 2.5),
    'vect__min_df': (50, 75),
    'vect__max_features': (500, 1000),
    'vect__ngram_range': ((1, 1), (1, 2),(1,3)),# unigramas or bigramas
    "imputer__strategy": ['mean', 'median', 'most_frequent'],
    "reglog__penalty": ["l1","l2"],
    "reglog__C": np.logspace(0, 4, 10)
}

In [198]:
# Almaceno en una lista de tuplas los modelos (nombre que le pongo, el modelo, hiperparametros)
models = [
    ('rand_forest', rand_forest, grid_random_forest),
    ('svm', svm, svm_param),
    ('reg_log', reg_log, reg_log_param),
    ('gradient_boosting', gbc, grid_gradient_boosting),
    ('xgboost', xgboost, grid_xgboost)
]

model0 = [('rand_forest', rand_forest, grid_random_forest)]

model1 = [('svm', svm, svm_param)]
model1_1 = [('svm', svm, svm_param2)]

model2 = [('reg_log', reg_log, reg_log_param)]
model2_1 = [('reg_log', reg_log, reg_log_param2)]

model3 = [('gradient_boosting', gbc, grid_gradient_boosting)]

model4 = [('xgboost', xgboost, grid_xgboost)]
model4_1 = [('xgboost', xgboost, grid_xgboost2)]

# Declaro en un diccionario los pipelines e hiperparametros
models_gridsearch = {}

for i in model0:
    models_gridsearch[i[0]] = GridSearchCV( i[1],
                                            i[2],
                                            cv = 3,
                                            verbose =1,
                                            scoring = 'roc_auc',
                                            n_jobs = -1)
    
    models_gridsearch[i[0]].fit(X_train['text'], X_train['target'])

Fitting 3 folds for each of 756 candidates, totalling 2268 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:    5.1s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:   18.3s
[Parallel(n_jobs=-1)]: Done 442 tasks      | elapsed:   40.7s
[Parallel(n_jobs=-1)]: Done 792 tasks      | elapsed:  1.2min
[Parallel(n_jobs=-1)]: Done 1242 tasks      | elapsed:  1.9min
[Parallel(n_jobs=-1)]: Done 1792 tasks      | elapsed:  2.8min
[Parallel(n_jobs=-1)]: Done 2268 out of 2268 | elapsed:  3.7min finished


In [199]:
best_grids = [(i, j.best_score_) for i, j in models_gridsearch.items()]

best_grids = pd.DataFrame(best_grids, columns=["Grid", "Best score"]).sort_values(by="Best score", ascending=False)
best_grids

Unnamed: 0,Grid,Best score
0,rand_forest,0.7406


0 	reg_log 	0.708547
[('reg_log',
  {'imputer__strategy': 'mean',
   'reglog__C': 1.0,
   'reglog__penalty': 'l2',
   'vect__max_df': 1.9,
   'vect__max_features': 500,
   'vect__min_df': 50,
   'vect__ngram_range': (1, 2)})]
   
   
reg_log 	0.710132 con snowball, sin espacios y sin arrobas y almohadillas   

rand_forest 	0.732476 con lemmanización, sin espacios y sin arrobas y almohadillas

rand_forest 	0.726502 con snowball, sin espacios y sin arrobas y almohadillas

In [194]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import VotingClassifier

vectorizer2 = CountVectorizer(binary=True)

vectorizer2.fit(X_train['text'])
               
X_train_baseline = vectorizer2.transform(X_train['text'])
X_test_baseline = vectorizer2.transform(X_test['text'])
                
log_clf = LogisticRegression(C=1.0, penalty='l2')
rnd_clf = RandomForestClassifier(n_estimators=90, max_depth=2, max_features=2)
svm_clf = SVC(gamma='scale', probability=True, C=9.8, degree=0.0009)

estimators = [('lr', log_clf), ('rf', rnd_clf), ('svc', svm_clf)]


voting_clf = VotingClassifier(estimators = estimators,
                             voting='soft')

In [195]:
voting_clf.fit(X_train_baseline, X_train['target'])

VotingClassifier(estimators=[('lr', LogisticRegression()),
                             ('rf',
                              RandomForestClassifier(max_depth=2,
                                                     max_features=2,
                                                     n_estimators=90)),
                             ('svc',
                              SVC(C=9.8, degree=0.0009, probability=True))],
                 voting='soft')

In [196]:
predictions_proba = voting_clf.predict_proba(X_test_baseline)

In [197]:
from sklearn.metrics import accuracy_score

for clf in (log_clf, rnd_clf, svm_clf, voting_clf):
    clf.fit(X_train_baseline, X_train['target'])
    y_pred = clf.predict(X_train_baseline)
    print(clf.__class__.__name__, accuracy_score(X_train['target'], y_pred ))

LogisticRegression 0.9561104805145668
RandomForestClassifier 0.5741581536133182
SVC 0.9907302307983352
VotingClassifier 0.9903518728717366


In [200]:
best_estimator = [(i, j.best_estimator_) for i, j in models_gridsearch.items()]
best_estimator

[('rand_forest',
  Pipeline(steps=[('vect',
                   CountVectorizer(max_df=1.9, max_features=1000, min_df=10)),
                  ('rand',
                   RandomForestClassifier(max_depth=15, max_features=3,
                                          n_estimators=120))]))]

In [44]:
best_params = [(i, j.best_params_) for i, j in models_gridsearch.items()]
best_params

[('reg_log',
  {'imputer__strategy': 'mean',
   'reglog__C': 1.0,
   'reglog__penalty': 'l2',
   'vect__max_df': 1.9,
   'vect__max_features': 500,
   'vect__min_df': 50,
   'vect__ngram_range': (1, 2)})]

In [55]:
predictions_proba = models_gridsearch[i[0]].predict_proba(X_test['text'])

In [180]:
df_submission = X_test[['id']]

df_submission['target'] = predictions_proba[:,-1]

df_submission.to_csv('data/submission.csv', index=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_submission['target'] = predictions_proba[:,-1]


In [58]:
df_submission.shape

(2247, 2)