In [2]:
from textblob import TextBlob
import pandas as pd
import numpy as np
import re
import matplotlib as plt
import json
import datetime as dt

In [3]:
import os
import tweepy as tw

# Twitter API credentials. Read them from the environment so real secrets are
# never committed with the notebook; the literals are placeholder fallbacks only.
consumer_key = os.environ.get("TWITTER_CONSUMER_KEY", "REDACTED")
consumer_secret = os.environ.get("TWITTER_CONSUMER_SECRET", "REDACTED")
access_key = os.environ.get("TWITTER_ACCESS_KEY", "REDACTED")
access_secret = os.environ.get("TWITTER_ACCESS_SECRET", "REDACTED")

# Authentication
auth = tw.OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_key, access_secret)
api = tw.API(auth, wait_on_rate_limit=True)

# Search query: any tweet mentioning one of these terms.
requete = "Tesla OR Elon OR Musk"

# NOTE(review): Tweepy 4 renamed API.search to API.search_tweets; fall back to
# the old name so the cell keeps working on Tweepy 3 -- confirm installed version.
search_endpoint = getattr(api, "search_tweets", None) or api.search

# Cursor(...).items() is a one-shot iterator. Materialise it once so that it can
# be traversed several times; previously the first traversal exhausted it and
# the DataFrame below was always built from nothing.
tweets = list(
    tw.Cursor(search_endpoint,
              q=requete,
              lang="en",
              ).items(50)
)

# Plain text of every fetched tweet.
tweet_texts = [tweet.text for tweet in tweets]

# One row per tweet, one column per attribute of the tweepy status object.
all_tweets = pd.DataFrame([t.__dict__ for t in tweets])




In [7]:
import pandas as pd
from scipy.stats import uniform, geom, loguniform, randint, expon
from sklearn import ensemble, neighbors, tree, linear_model, svm, naive_bayes, gaussian_process, feature_selection, preprocessing, impute, metrics, decomposition, compose
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import roc_auc_score
from sklearn.pipeline import make_pipeline, Pipeline as Pipeline
import itertools
import random
import pickle
# Integer seed -- presumably for the train/test split and estimators; TODO confirm.
RANDOM_STATE = 10
# Presumably the hold-out fraction for train_test_split (imported above); TODO confirm.
TEST_SIZE = 0.1


In [9]:
# --- Randomised search settings --------------------------------------------
K_FOLDS = 10   # passed as `cv` to RandomizedSearchCV
N_JOBS = 1     # passed as `n_jobs` to RandomizedSearchCV
VERBOSE = 0    # passed as `verbose` to RandomizedSearchCV
N_ITER = 1     # presumably the number of sampled candidates (n_iter) -- confirm

# --- Scoring ----------------------------------------------------------------
SCORING_METRIC = 'roc_auc'                 # scorer name used during cross-validation
SCORING_FUNCTION = metrics.roc_auc_score   # callable applied to the hold-out predictions

# --- Estimator settings -------------------------------------------------------
MAX_ITER = 10000                     # iteration cap given to SVC and LogisticRegression
LOGISTIC_REGRESSION_SOLVER = 'sag'   # solver given to the LogisticRegression candidate

# --- Data / persistence -------------------------------------------------------
categorical_feature_names = ['categorical']       # columns routed to the one-hot encoder
OPTIMAL_MODEL_FILENAME = 'optimal_model.pickle'   # where the best search object is pickled

# Best hold-out score seen so far; the search loop updates it.
best_score = 0

In [10]:
# Candidate estimators. Each entry is (estimator, search space, n_iter):
#   * the search-space keys are prefixed with the pipeline step name ('model__');
#   * the third element is handed to RandomizedSearchCV as `n_iter` by the
#     search loop, so it is driven by N_ITER instead of a hard-coded literal.
# Stochastic estimators are seeded with RANDOM_STATE so that runs are repeatable.
models = [
    (ensemble.RandomForestClassifier(random_state=RANDOM_STATE), {
        'model__n_estimators': randint(50, 500),
        'model__max_depth': randint(3, 10),
        'model__max_features': ['sqrt'],
        'model__min_samples_split': randint(2, 20),
        'model__min_samples_leaf': randint(1, 10),
        'model__criterion': ['gini', 'entropy'],
        'model__ccp_alpha': loguniform(0.1e-5, 0.1e-0)
    }, N_ITER),

    (neighbors.KNeighborsClassifier(), {
        'model__n_neighbors': randint(3, 20),
        'model__weights': ['uniform', 'distance'],
        'model__algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute']
    }, N_ITER),

    # probability=True is required because the search loop calls predict_proba.
    (svm.SVC(probability=True, max_iter=MAX_ITER, random_state=RANDOM_STATE), {
        'model__C': loguniform(3e-4, 3e-1),
        'model__kernel': ['linear', 'poly', 'rbf', 'sigmoid']
    }, N_ITER),

    (linear_model.LogisticRegression(solver=LOGISTIC_REGRESSION_SOLVER,
                                     max_iter=MAX_ITER,
                                     random_state=RANDOM_STATE), {
        'model__C': loguniform(3e-4, 3e-1),
        # NOTE(review): scikit-learn >= 1.2 deprecates the string 'none' in
        # favour of None -- confirm against the installed version.
        'model__penalty': ['none', 'l2'],
        'model__class_weight': ['balanced', None]
    }, N_ITER),

    (naive_bayes.GaussianNB(), {}, N_ITER),
]

# Feature selection / dimensionality reduction candidates ('feature_selection' step).
feature_selectors = [
    (feature_selection.SelectFromModel(linear_model.LogisticRegression()), {
        'feature_selection__estimator__penalty': ['l2'],
    }),
    (decomposition.PCA(random_state=RANDOM_STATE), {
        'feature_selection__n_components': randint(2, 5),
    }),
]

# Scaling candidates ('scaler' step).
scalers = [
    (preprocessing.MinMaxScaler(), {}),
    (preprocessing.RobustScaler(), {
        'scaler__quantile_range': [(25.0, 75.0), (10.0, 90.0)]
    })
]

# Missing-value imputation candidates ('imputer' step).
imputation = [
    (impute.SimpleImputer(),
     {'imputer__strategy': ['mean', 'median']})
]

# Categorical encoders, applied through the ColumnTransformer's 'transformer' entry.
transformers = [
    (preprocessing.OneHotEncoder(),
     {'column_transformer__transformer__drop': ['first', 'if_binary', None]})
]

# Every combination of (transformer, imputer, scaler, feature selector, model).
hyparameters = list(
    itertools.product(
        transformers, imputation, scalers, feature_selectors, models
    ))

In [11]:
# Randomised hyper-parameter search over every preprocessing/model combination.
#
# NOTE(review): this cell needs X_train, y_train, X_test and y_test, none of
# which are created in the visible cells (the recorded run stopped with a
# NameError). Build them first, e.g. with train_test_split(..., 
# test_size=TEST_SIZE, random_state=RANDOM_STATE).
for transformer_params, imputer_params, scaler_params, feature_selection_params, model_params in hyparameters:
    # Merge the per-step search spaces; keys are already prefixed with step names.
    hyperparameter_dict = {
        **transformer_params[1],
        **imputer_params[1],
        **scaler_params[1],
        **feature_selection_params[1],
        **model_params[1]
    }
    # Encode the categorical columns, pass every other column through untouched.
    column_transformer = compose.ColumnTransformer(
        [('transformer', transformer_params[0], categorical_feature_names)],
        remainder="passthrough"
    )
    pipe = Pipeline(steps=[
        ('column_transformer', column_transformer),
        ('scaler', scaler_params[0]),
        ('imputer', imputer_params[0]),
        ('feature_selection', feature_selection_params[0]),
        ('model', model_params[0])
    ])
    # random_state makes the sampled candidates repeatable between runs.
    optimal_model = RandomizedSearchCV(
        pipe, hyperparameter_dict,
        n_iter=model_params[2], cv=K_FOLDS,
        scoring=SCORING_METRIC, n_jobs=N_JOBS,
        return_train_score=True, verbose=VERBOSE,
        random_state=RANDOM_STATE
    )
    optimal_model.fit(X_train, y_train)

    best_pipeline = optimal_model.best_estimator_
    # Training-set predictions; not used below but kept for later inspection.
    y_pred = best_pipeline.predict(X_train)
    y_pred_prob = best_pipeline.predict_proba(X_train)[:, 1]
    # Positive-class probabilities on the hold-out set, scored with SCORING_FUNCTION.
    y_pred_test = best_pipeline.predict_proba(X_test)[:, 1]
    score = SCORING_FUNCTION(y_test, y_pred_test)
    print(
        'Optimal Training Score: ', optimal_model.cv_results_['mean_train_score'][optimal_model.best_index_],
        '\nOptimal Test Score: ', optimal_model.best_score_,
        '\nHold Out Test Score: ', score
    )
    # NOTE(review): the persisted model is chosen by its hold-out score, so that
    # score is no longer an unbiased estimate -- consider selecting on
    # optimal_model.best_score_ instead.
    if score > best_score:
        best_score = score
        # Context manager guarantees the file is flushed and closed.
        with open(OPTIMAL_MODEL_FILENAME, 'wb') as model_file:
            pickle.dump(optimal_model, model_file)

NameError: name 'X_train' is not defined