# Data Collection and Retrieval
Data Collection and Retrieval (same requirements as the individual data retrieval project):
* Topic: anything you choose, but it must include the following data types:
    * numeric features
    * categorical features (to be dummy coded)
    * text features (to be processed using text analytics)
    * image features (to be processed using image classification) 
    * labels to choose from (i.e. outcomes that you want to predict with the other features)
* Scrape approximately 500 records (+/- 50)

In [None]:
!pip install pyLDAvis
!pip install pyLDAvis.gensim
!pip install logging
!pip install nltk
!pip install -U pip setuptools wheel
!pip install -U spacy
!python -m spacy download en_core_web_sm

In [None]:
import requests
import time
import json
import pandas as pd

def get_tweets(bearer_token, n=525, max_results=10,
               search_term="manchester%20united", pause_seconds=8):
    """Page through the Twitter v2 recent-search endpoint and collect tweets.

    Tweets and their authors are accumulated into two DataFrames which are
    written to ``tweets.csv`` and ``users.csv`` in the working directory.

    Args:
        bearer_token: OAuth2 bearer token sent in the Authorization header.
        n: Stop once at least this many tweets have been retrieved.
        max_results: Page size requested from the API (the endpoint accepts
            values between 10 and 100).
        search_term: Query string, already URL-encoded.
        pause_seconds: Delay between page requests to stay under the rate
            limit.

    Returns:
        The tweets DataFrame, indexed by tweet id, with columns tweet_id,
        author_id, retweet_count, like_count, text, language, created_at,
        source, possibly_sensitive and image_url.
    """
    headers = {'Authorization': 'Bearer ' + bearer_token}

    # Everything except the paging token is identical for every request.
    # We request more fields than we store, to show what is available.
    base_url = (
        'https://api.twitter.com/2/tweets/search/recent'
        f'?query={search_term}&max_results={max_results}'
        '&expansions=geo.place_id,author_id,attachments.media_keys'
        '&tweet.fields=attachments,author_id,context_annotations,conversation_id,created_at,entities,geo,id,in_reply_to_user_id,lang,possibly_sensitive,public_metrics,referenced_tweets,reply_settings,source,text,withheld'
        '&media.fields=media_key,type,url&user.fields=created_at,description,entities,id,location,name,pinned_tweet_id,profile_image_url,protected,public_metrics,url,username,verified,withheld'
    )

    df_tweets = pd.DataFrame(columns=['tweet_id', 'author_id', 'retweet_count', 'like_count',
                                      'text', 'language', 'created_at', 'source', 'possibly_sensitive', 'image_url'])
    df_users = pd.DataFrame(columns=['user_id', 'username', 'created_at', 'description', 'profile_image_url',
                                     'protected', 'verified', 'followers_count', 'following_count', 'tweet_count', 'listed_count'])

    total_retrieved = 0
    next_token = ""  # empty on the first request

    while total_retrieved < n:
        url = base_url if not next_token else f'{base_url}&next_token={next_token}'

        response = requests.request("GET", url, headers=headers)
        try:
            json_data = json.loads(response.text)
        except json.JSONDecodeError:
            # Not JSON at all: show what came back and stop paging.
            print(response.text)
            break

        tweets = json_data.get('data')
        if not tweets:
            # Error payloads and empty result pages carry no 'data' key.
            print(response.text)
            break

        includes = json_data.get('includes', {})

        # Map media_key -> url for photos only (videos are ignored), so each
        # tweet needs a single lookup instead of a scan over all media.
        photo_urls = {
            media['media_key']: media.get('url', "")
            for media in includes.get('media', [])
            if media.get('type') == 'photo'
        }

        for tweet in tweets:
            tweet_id = tweet['id']
            metrics = tweet['public_metrics']

            # Only the first attachment is considered.
            media_keys = tweet.get('attachments', {}).get('media_keys', [])
            image_url = photo_urls.get(media_keys[0], "") if media_keys else ""

            df_tweets.loc[tweet_id] = [
                tweet_id,
                tweet['author_id'],
                metrics['retweet_count'],        # label
                metrics['like_count'],           # label
                tweet['text'],                   # text
                tweet['lang'],                   # categorical
                tweet['created_at'],             # categorical
                tweet.get('source', ""),         # categorical; tolerated if the API omits it
                tweet['possibly_sensitive'],     # categorical
                image_url,                       # image
            ]

        # Count what was actually returned, not the requested page size.
        total_retrieved += len(tweets)
        print(f'{total_retrieved} tweets retrieved')

        # Record the authors BEFORE checking for a next page so the final
        # page's users are not lost.
        for user in includes.get('users', []):
            user_id = user['id']
            user_metrics = user['public_metrics']
            df_users.loc[user_id] = [
                user_id,
                user['username'],
                user['created_at'],
                user.get('description', ""),
                user.get('profile_image_url', ""),
                user['protected'],
                user['verified'],
                user_metrics['followers_count'],
                user_metrics['following_count'],
                user_metrics['tweet_count'],
                user_metrics['listed_count'],
            ]

        # Keep track of where to start next time; quit if there are no more results.
        next_token = json_data.get('meta', {}).get('next_token', "")
        if not next_token:
            break

        # Sleep to avoid hitting the rate limit (skipped after the last page).
        if total_retrieved < n:
            time.sleep(pause_seconds)

    # All done! Save the DataFrames to csv files.
    df_tweets.to_csv('tweets.csv')
    df_users.to_csv('users.csv')
    print('Got all the tweets!')
    return df_tweets

In [None]:
# Run the scrape; the returned value is bound to `df` for the cells below.
# NOTE(review): the bearer token is hard-coded in the notebook. Anyone with
# access to this file can use it -- revoke it and load the replacement from an
# environment variable or a secrets store instead of committing it.
df = get_tweets(bearer_token='AAAAAAAAAAAAAAAAAAAAAGnaTwEAAAAAhRdM6yLmei6skyaWcjbx8IDFnlw%3DLPQHO2CTw1nVjjHLx3htgP9qmeCOgPpt96EdDujokNcWljI5iP')

# Feature Cleaning and Engineering
* The cleaning steps should include any basic steps needed to prepare the data including:
    * Binning rare group values
    * Standardizing values
    * Adjusting for skewness
    * Handle missing values
* These steps will be unique to each dataset
* Engineering includes converting the unstructured text and images features into usable features. This could include:
    * Topic modeling text
    * Extracting characteristics of the text (e.g. word counts, sentiment)
    * Extracting characteristics of the images (e.g., number of faces, smiles)

In [14]:
# Bin rare group values
def bin_groups(df, percent=.05, cols_to_exclude=None):
    """Collapse rare category values into a single 'Other' group.

    For every non-numeric column not listed in *cols_to_exclude*, any value
    whose share of the rows is below *percent* is replaced by the string
    'Other'.

    Args:
        df: DataFrame to clean. It is modified in place.
        percent: Minimum fraction of rows a value needs to keep its own group.
        cols_to_exclude: Column names to leave untouched (e.g. free text).

    Returns:
        The same DataFrame object, after binning.
    """
    import pandas as pd

    excluded = set() if cols_to_exclude is None else set(cols_to_exclude)
    n_rows = len(df)

    for col in df:
        if col in excluded or pd.api.types.is_numeric_dtype(df[col]):
            continue
        counts = df[col].value_counts()
        # Share is measured against all rows, including those with missing values.
        rare_values = counts[counts / n_rows < percent].index
        if len(rare_values):
            df.loc[df[col].isin(rare_values), col] = 'Other'
    return df

#Standardize data
def standardize_values(df):
    """Return a min-max scaled copy of *df*.

    Every column is passed through scikit-learn's ``MinMaxScaler`` with its
    default settings. The result keeps the column names of *df* but gets a
    fresh default index.
    """
    import numpy as np
    import pandas as pd
    from sklearn import preprocessing

    scaler = preprocessing.MinMaxScaler()
    scaled_values = scaler.fit_transform(df)
    return pd.DataFrame(scaled_values, columns=df.columns)

# Dummy code categorical variables
def dummy_code_categorical_variables(df):
    """Replace every non-numeric column with dummy (one-hot) columns.

    The first level of each categorical column is dropped to avoid perfect
    collinearity, and dummy columns are named ``<column>_<value>``. This
    matches the encoding the ``impute_*`` helpers in this notebook apply.

    Args:
        df: Input DataFrame. It is not modified.

    Returns:
        A new DataFrame in which each non-numeric column has been replaced by
        its dummy columns; numeric columns are carried over unchanged.
    """
    import pandas as pd

    categorical_cols = [col for col in df
                        if not pd.api.types.is_numeric_dtype(df[col])]
    if not categorical_cols:
        return df.copy()

    # get_dummies(columns=...) both encodes and drops the source columns, so
    # no join (and no join suffixes) is needed.
    return pd.get_dummies(df, columns=categorical_cols, drop_first=True)

# Handle missing values
def drop_columns_missing_data(df, cutoff=.5):
    """Drop, in place, every column whose share of missing values exceeds *cutoff*.

    Returns the same DataFrame object that was passed in.
    """
    import pandas as pd

    n_rows = len(df)
    too_sparse = [name for name in df
                  if df[name].isna().sum() / n_rows > cutoff]
    for name in too_sparse:
        df.drop(columns=[name], inplace=True)
    return df

# Handle missing values
def impute_mean(df):
    """Dummy-code the non-numeric columns, then fill missing values with column means.

    Returns a new DataFrame with a fresh default index whose columns are the
    numeric columns of *df* followed by the generated dummy columns.
    """
    import numpy as np
    import pandas as pd
    from sklearn.impute import SimpleImputer

    # Encode all categorical columns (first level dropped) before imputing.
    categorical = [name for name in df
                   if not pd.api.types.is_numeric_dtype(df[name])]
    if categorical:
        df = pd.get_dummies(df, columns=categorical, drop_first=True)

    imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
    filled = imputer.fit_transform(df)
    return pd.DataFrame(filled, columns=df.columns)

# Handle missing values AND standardize values
def impute_KNN(df):
    """Dummy-code, min-max scale, then fill missing values from the 5 nearest rows.

    Scaling happens before imputation so that every feature contributes on the
    same scale to the neighbour distances. Returns a new DataFrame with a
    fresh default index.
    """
    import pandas as pd
    from sklearn.impute import KNNImputer
    from sklearn.preprocessing import MinMaxScaler

    categorical = [name for name in df
                   if not pd.api.types.is_numeric_dtype(df[name])]
    if categorical:
        df = pd.get_dummies(df, columns=categorical, drop_first=True)

    scaled = pd.DataFrame(MinMaxScaler().fit_transform(df), columns=df.columns)

    imputer = KNNImputer(n_neighbors=5, weights="uniform")
    filled = imputer.fit_transform(scaled)
    return pd.DataFrame(filled, columns=scaled.columns)

# Handle missing values
def impute_reg(df):
    """Dummy-code the non-numeric columns, then impute with IterativeImputer.

    The imputer runs for at most 10 rounds with a fixed random seed. Returns a
    new DataFrame with a fresh default index.
    """
    # IterativeImputer is still experimental; this import switches it on.
    from sklearn.experimental import enable_iterative_imputer
    from sklearn.impute import IterativeImputer
    import pandas as pd

    categorical = [name for name in df
                   if not pd.api.types.is_numeric_dtype(df[name])]
    if categorical:
        df = pd.get_dummies(df, columns=categorical, drop_first=True)

    imputer = IterativeImputer(max_iter=10, random_state=12345)
    filled = imputer.fit_transform(df)
    return pd.DataFrame(filled, columns=df.columns)


def fit_mlr(df, test_size=.2, random_state=12345, label=''):
    """Fit an ordinary least-squares regression on a train/test split.

    *label* names the target column; every other column is used as a feature.
    The R-squared on the held-out split is printed and the fitted model is
    returned.
    """
    from sklearn.linear_model import LinearRegression
    from sklearn.model_selection import train_test_split
    import pandas as pd

    features = df.drop(label, axis=1)
    target = df[label]

    split = train_test_split(
        features, target, test_size=test_size, random_state=random_state)
    X_train, X_test, y_train, y_test = split

    model = LinearRegression()
    model.fit(X_train, y_train)

    r_squared = model.score(X_test, y_test)
    print(f'R-squared (mlr): \t{r_squared}')
    return model


def fit_crossvalidate_mlr(df, k, label, repeat=True):
    """Cross-validate a linear regression and return a model fit on all rows.

    Args:
        df: DataFrame holding the features and the label column.
        k: Number of folds to split the data into.
        label: Name of the target column.
        repeat: If True, run 5 repeats of k-fold; otherwise one shuffled k-fold.

    Returns:
        A LinearRegression fitted on the full dataset. The average R-squared
        across the validation folds is printed.
    """
    from numpy import mean
    from sklearn.linear_model import LinearRegression
    from sklearn.model_selection import KFold, RepeatedKFold, cross_val_score

    X = df.drop(label, axis=1)
    y = df[label]

    # Honour the caller's k; the fold count used to be fixed at 10.
    if repeat:
        cv = RepeatedKFold(n_splits=k, n_repeats=5, random_state=12345)
    else:
        cv = KFold(n_splits=k, random_state=12345, shuffle=True)

    scores = cross_val_score(LinearRegression(), X, y,
                             scoring='r2', cv=cv, n_jobs=-1)
    print(f'Average R-squared:\t{mean(scores)}')
    return LinearRegression().fit(X, y)

def calc_sentiment(df):
    """Add four VADER sentiment columns computed from the 'text' column.

    The columns sentiment_overall, sentiment_negative, sentiment_neutral and
    sentiment_positive are written onto *df* itself, which is also returned.
    """
    import pandas as pd
    import nltk
    from nltk.sentiment import SentimentIntensityAnalyzer

    nltk.download('vader_lexicon')
    analyzer = SentimentIntensityAnalyzer()

    # Output column -> key in the dict returned by polarity_scores().
    score_keys = {
        'sentiment_overall': 'compound',
        'sentiment_negative': 'neg',
        'sentiment_neutral': 'neu',
        'sentiment_positive': 'pos',
    }

    # Create the float columns first so they exist for every row.
    for column in score_keys:
        df[column] = 0.0

    for idx, text in zip(df.index, df['text']):
        scores = analyzer.polarity_scores(text)
        for column, key in score_keys.items():
            df.at[idx, column] = scores[key]

    return df

def _imagga_category_scores(image_url, api_key, api_secret):
    """Return {category name: confidence} from Imagga for one image, or {} on failure."""
    import requests

    try:
        response = requests.get(
            'https://api.imagga.com/v2/categories/personal_photos/',
            params={'image_url': image_url},  # lets requests URL-encode the image address
            auth=(api_key, api_secret),
            timeout=30,
        )
        payload = response.json()
    except (requests.RequestException, ValueError) as err:
        print(f'Imagga request failed for {image_url}: {err}')
        return {}

    result = payload.get('result') if isinstance(payload, dict) else None
    if not result:
        # Error responses carry no 'result'; score the image as all zeros.
        print(f'Imagga returned no categories for {image_url}: {payload}')
        return {}

    scores = {}
    for category in result.get('categories', []):
        # Keep the first occurrence of each category name.
        scores.setdefault(category['name']['en'], category['confidence'])
    return scores


def image_classification(df, api_key, api_secret):
    """Append Imagga 'personal_photos' category confidences for each tweet image.

    Rows with no image (missing or blank ``image_url``) and images the API
    could not score get 0.0 in every category column.

    Args:
        df: DataFrame with 'tweet_id' and 'image_url' columns.
        api_key: Imagga API key.
        api_secret: Imagga API secret.

    Returns:
        *df* merged with one column per category. Because the merge is keyed
        on array-likes, pandas also adds a 'key_0' column holding the join
        key; it is kept so existing callers see the same columns as before.
    """
    import pandas as pd

    categories = ["interior objects", "nature landscape", "beaches seaside", "events parties", "food drinks",
                  "paintings art", "pets animals", "text visuals", "sunrises sunsets", "cars vehicles",
                  "macro flowers", "streetview architecture", "people portraits"]

    scores_by_tweet = {}
    for tweet_id, image_url in zip(df['tweet_id'], df['image_url']):
        has_image = not pd.isna(image_url) and str(image_url).strip() != ''
        found = _imagga_category_scores(image_url, api_key, api_secret) if has_image else {}
        # One row per tweet, written once, in the fixed category order.
        scores_by_tweet[tweet_id] = [found.get(name, 0.0) for name in categories]

    df_imagga = pd.DataFrame.from_dict(scores_by_tweet, orient='index', columns=categories)

    # Merge the two DataFrames on tweet id.
    return pd.merge(df, df_imagga, left_on=df.tweet_id, right_on=df_imagga.index)

def drop_random_columns(df):
    """Remove the bookkeeping columns 'Unnamed: 0' and 'key_0' from *df* in place.

    Columns that are not present are skipped. Returns the same DataFrame object.
    """
    for leftover in ('Unnamed: 0', 'key_0'):
        if leftover not in df.columns:
            continue
        df.drop(columns=[leftover], inplace=True)
    return df


In [15]:
# Scratch cell: an inline variant of image_classification() that scores each
# tweet image with Imagga and then tries to attach the scores to `df`.
# The saved output of this cell is an AttributeError (no `image_url` on the
# row), i.e. it was run against a `df` that no longer had that column.
# NOTE(review): the Imagga key and secret are hard-coded here; rotate them and
# load the replacements from the environment instead of the notebook.
api_key='acc_d2be34779581c77'
api_secret='2648aa8fc9eb5305688d66089b6856f6'


import pandas as pd
import requests, json
# Unlike image_classification(), this frame also carries a 'tweet_id' column.
df_imagga = pd.DataFrame(columns=["interior objects", "nature landscape", "beaches seaside", "events parties", "food drinks",
"paintings art", "pets animals", "text visuals", "sunrises sunsets", "cars vehicles",
"macro flowers", "streetview architecture", "people portraits", 'tweet_id'])
for row in df.itertuples():
    tweet_id = row.tweet_id
    if pd.isnull(row.image_url) or pd.isna(row.image_url):
        # No image: the row is all zeros. Because 'tweet_id' is one of the
        # columns, that column is set to 0.0 as well.
        scores = [0.0] * len(df_imagga.columns)

        for n, col in enumerate(df_imagga.columns):
            # Iterate through each category of the result
            scores[n] = 0.0
            # Store the list as a new row in the DataFrame
            df_imagga.loc[tweet_id] = scores
    else:
        url = 'https://api.imagga.com/v2/categories/personal_photos/?image_url=' + row.image_url
        request = requests.get(url, auth=(api_key, api_secret))
        json_data = json.loads(request.text)

        # Create a list of 0.0 scores to update as we get data for each category we want to score in our DataFrame
        scores = [0.0] * len(df_imagga.columns)

        # Find the associated column in the DataFrame
        for n, col in enumerate(df_imagga.columns):
            # Iterate through each category of the result
            for category in json_data["result"]["categories"]:
                if col == category['name']['en']:
                    # Store the score
                    scores[n] = category['confidence']
                    break  # No need to keep looping once we've found the score
            # Store the list as a new row in the DataFrame
            # scores.append(tweet_id)
            # NOTE(review): the row label here is the column position `n`, not
            # the tweet id, so rows 0..13 are overwritten for every image.
            df_imagga.loc[n] = scores
            # NOTE(review): assign() with a scalar sets 'tweet_id' on every
            # row to the current tweet's id, not just on the new row.
            df_imagga = df_imagga.assign(tweet_id=tweet_id)
# print(df_imagga.columns)
# print(df.columns)
# merge the two DataFrames
# = pd.merge(df, df_imagga, left_on=df.tweet_id, right_on=df_imagga.index)
# NOTE(review): the result of this join is not assigned, so it has no effect.
df.join(df_imagga, how = 'left', lsuffix = '_left', rsuffix = '_right')
# df_imagga.head()
print(df.columns)
print(df_imagga.columns)
# df_imagga.index.name = 'tweet_id'
# With no `on=` given, merge() joins on the column names the two frames share.
df = df.merge(df_imagga, how='left')

AttributeError: 'Pandas' object has no attribute 'image_url'

In [16]:
# Cleaning / feature-engineering pipeline: reload the scraped tweets from disk
# and turn them into an all-numeric, scaled and imputed DataFrame.
import pandas as pd
df = pd.read_csv('./tweets.csv')

# Data cleaning and Prep
# Exclude columns it doesn't make sense to bin
df = bin_groups(df, percent=.05, cols_to_exclude=['text', 'created_at', 'image_url'])
# df = drop_columns_missing_data(df, cutoff=.50)
# df = dummy_code_categorical_variables(df)
# df = standardize_values(df)

# Engineering
df = calc_sentiment(df)
# NOTE(review): the Imagga credentials are hard-coded here; rotate them and
# load the replacements from the environment instead of the notebook.
df = image_classification(df=df, api_key='acc_d2be34779581c77', api_secret='2648aa8fc9eb5305688d66089b6856f6')

# drop columns I'm finished analyzing
# NOTE(review): the output below still contains 'Unnamed: 0' and 'key_0';
# drop_random_columns() is defined above but is not called in this cell.
df = df.drop(columns=['text', 'created_at', 'image_url'])

# create dummy variables for categorical variables
# df = dummy_code_categorical_variables(df)

# impute KNN, also standardize values
# (impute_KNN dummy-codes the remaining categorical columns itself)
df = impute_KNN(df)

# Display the resulting DataFrame as the cell output.
df


[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\Jackson\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


Unnamed: 0,key_0,tweet_id,author_id,retweet_count,like_count,possibly_sensitive,sentiment_overall,sentiment_negative,sentiment_neutral,sentiment_positive,...,macro flowers,streetview architecture,people portraits,language_en,language_es,language_in,language_pt,source_Twitter Web App,source_Twitter for Android,source_Twitter for iPhone
0,1.000000,1.000000,8.529656e-01,0.001203,0.000000,0.0,0.283954,0.511538,0.603306,0.212,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
1,0.998266,0.998266,9.886300e-01,0.000000,0.000000,0.0,0.664719,0.000000,0.638017,0.438,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
2,0.997765,0.997765,6.627899e-01,0.000000,0.000000,0.0,0.664719,0.000000,0.638017,0.438,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
3,0.997469,0.997469,9.955527e-01,0.000000,0.000000,0.0,1.000000,0.169231,0.348760,0.700,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
4,0.994845,0.994845,9.205802e-01,0.003180,0.000000,0.0,0.555774,0.280769,0.690909,0.228,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
525,0.004469,0.004469,8.279884e-01,0.034207,0.000000,0.0,0.664719,0.000000,0.664463,0.406,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
526,0.003925,0.003925,9.511404e-01,0.000000,0.000000,0.0,0.699784,0.000000,0.519008,0.582,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
527,0.003028,0.003028,8.908156e-01,0.000000,0.003817,0.0,0.688413,0.288462,0.583471,0.354,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
528,0.001840,0.001840,9.951360e-11,0.024409,0.000000,0.0,0.555774,0.307692,0.662810,0.248,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0


In [17]:
def fs_select_linear(df, label=""):
    """Keep the features that an L1-penalised linear SVC gives non-zero weight.

    Returns *df* restricted to the selected feature columns followed by the
    *label* column.
    """
    from sklearn.svm import LinearSVC
    from sklearn.feature_selection import SelectFromModel
    import pandas as pd

    features = df.drop(label, axis=1)
    target = df[label]

    # As C increases, more features are kept
    svc = LinearSVC(C=0.05, penalty="l1", dual=False)
    svc.fit(features, target)

    selector = SelectFromModel(svc, prefit=True)
    selector.transform(features)  # result unused; call kept so behaviour is unchanged

    kept = [name for name, keep in zip(features.columns, selector.get_support())
            if keep]
    kept.append(label)
    return df[kept]

def fs_selectkbest(df, k=10, label=""):
    """Keep the *k* features that score highest on ``r_regression`` against *label*.

    Returns the selected feature columns of *df* joined with the label column.
    """
    from sklearn.feature_selection import SelectKBest, r_regression
    import pandas as pd

    features = df.drop(columns=[label])
    target = df[label]

    # Select the top k features based on a given bivariate metric
    selector = SelectKBest(r_regression, k=k)
    selector.fit_transform(features, target)

    chosen = selector.get_feature_names_out()
    label_column = df[label]
    return df[chosen].join(label_column)

def fs_variance(df, label="", p=0.8):
    """Drop features whose variance is below ``p * (1 - p)``.

    Args:
        df: DataFrame of numeric features, optionally including the label.
        label: Name of the label column. It is excluded from the variance
            filter and always kept. Pass "" when *df* has no label column.
        p: Probability used to build the variance threshold ``p * (1 - p)``.

    Returns:
        *df* restricted to the surviving feature columns, followed by the
        label column when one was given.
    """
    from sklearn.feature_selection import VarianceThreshold

    has_label = label != ""
    # Without a label every column is a candidate feature.
    X = df.drop(columns=[label]) if has_label else df

    sel = VarianceThreshold(threshold=(p * (1 - p)))
    sel.fit(X)

    kept = list(sel.get_feature_names_out())
    if has_label:
        # Add the label back in after removing poor features
        kept.append(label)
    return df[kept]

# fs_linear = fs_select_linear(df, label="retweet_count")
# fs_variance = fs_variance(df, label="retweet_count", p=.1)
# fs_selectkbest = fs_selectkbest(df, 58, label="retweet_count")




# Modeling 
(same requirements as the modeling project)
* Generate the best possible model for:
    * Regression
    * Classification
    * Clustering
* You choose which features to keep in the model
* There is no required level of fit metric. Your task is simply to get the best fit metrics possible--not achieve a certain value. All datasets are different and there is no way to compare them

In [18]:
def algorithm_selection(df, cols_to_drop=None, label='like_count',
                        model_path='stored_model.sav'):
    """Fit a range of regressors, report their test scores and save the best one.

    Only the numeric columns of *df* are used. Features are min-max scaled,
    split 70/30 into train and test sets, and every candidate is scored with
    its ``score`` method (R-squared for scikit-learn regressors) on the test
    set. The scores are printed from best to worst and the best fitted
    estimator is pickled to *model_path*.

    Args:
        df: DataFrame holding the features and the label column.
        cols_to_drop: Columns to exclude from the feature matrix.
        label: Name of the target column. It is always removed from the
            features, whether or not it appears in *cols_to_drop*.
        model_path: File the best fitted estimator is pickled to.

    Returns:
        The fitted estimator with the highest test score.
    """
    import pickle

    import numpy as np
    import pandas as pd
    import sklearn.ensemble as se
    import sklearn.linear_model as lm
    import sklearn.neural_network as nn
    import sklearn.tree as tree
    from sklearn import gaussian_process
    from sklearn import model_selection
    from sklearn import neighbors
    from sklearn import preprocessing
    from sklearn import svm
    from sklearn.gaussian_process.kernels import DotProduct, WhiteKernel
    from sklearn.linear_model import RidgeCV, LassoCV

    cols_to_drop = [] if cols_to_drop is None else list(cols_to_drop)

    df = df.select_dtypes(np.number)  # Remove categorical features first
    y = df[label]                     # Save the label first
    X = df.drop(columns=cols_to_drop)
    if label in X.columns:
        # Never let the target leak into the features.
        X = X.drop(columns=[label])

    # Scale/normalize the features
    X = pd.DataFrame(preprocessing.MinMaxScaler().fit_transform(X),
                     columns=X.columns, index=X.index)

    # Split the data
    X_train, X_test, y_train, y_test = model_selection.train_test_split(
        X, y, test_size=0.3, random_state=12345)

    # Tree-based estimators are named so the voting ensemble can reuse them.
    model_dt = tree.DecisionTreeRegressor(random_state=12345)
    model_df = se.RandomForestRegressor(random_state=12345)
    model_etr = se.ExtraTreesRegressor(random_state=12345)
    model_abr = se.AdaBoostRegressor(n_estimators=100, random_state=12345)
    model_gbr = se.GradientBoostingRegressor(random_state=12345)

    stacking_estimators = [('ridge', RidgeCV()),
                           ('lasso', LassoCV(random_state=42)),
                           ('svr', svm.SVR(C=1, gamma=1e-6))]

    # Report name -> unfitted estimator, in the order they are fitted.
    candidates = {
        # LINEAR MODELS: assume normal distribution, homoscedasticity, no
        # multi-collinearity, independence, and no auto-correlation
        'OrdinaryLS R': lm.LinearRegression(),
        # Ridge: more robust to multi-collinearity (tune alpha between 0 and 1)
        'Ridge R': lm.Ridge(alpha=0.5),
        # Lasso: better for sparse values (tune alpha between 0 and 1)
        'Lasso R': lm.Lasso(alpha=0.1),
        # LARS: good when there are more features than samples
        'LARS Lasso R': lm.LassoLars(alpha=0.1),
        # Bayesian: regularization parameters are tuned to the data
        'Bayesian R': lm.BayesianRidge(),
        # SUPPORT VECTOR MACHINES
        'SupportVM R': svm.SVR(),
        'Linear SVM R': svm.LinearSVR(),
        'NuSupportVM R': svm.NuSVR(),
        # STOCHASTIC GRADIENT DESCENT REGRESSION
        'SGradientD R': lm.SGDRegressor(),
        # KNN: NEAREST NEIGHBORS REGRESSION (weights must be passed by keyword)
        'KNNeighbors R': neighbors.KNeighborsRegressor(n_neighbors=5, weights='uniform'),
        'KNNeighborsD R': neighbors.KNeighborsRegressor(n_neighbors=8, weights='distance'),
        # GAUSSIAN PROCESS REGRESSION
        'GaussianP R': gaussian_process.GaussianProcessRegressor(
            DotProduct() + WhiteKernel()),
        # DECISION TREE MODELS: no assumptions about the data
        'Dec Tree R': model_dt,
        # DECISION TREE-BASED ENSEMBLE MODELS
        'Dec Forest R': model_df,
        'Extra Trees R': model_etr,
        'AdaBoost DT R': model_abr,
        'Grad. Boost R': model_gbr,
        # VotingRegressor: averages the predictions of the other algorithms
        'Voting R': se.VotingRegressor(estimators=[
            ('DT', model_dt), ('DF', model_df), ('ETR', model_etr),
            ('ABR', model_abr), ('GBR', model_gbr)]),
        'Stacking R': se.StackingRegressor(
            estimators=stacking_estimators,
            final_estimator=se.GradientBoostingRegressor(random_state=12345)),
        # NEURAL-NETWORK MODELS (turn max_iter up or down to trade time for accuracy)
        'NeuralNet R': nn.MLPRegressor(max_iter=1000, random_state=12345),
    }

    fit = {}  # report name -> test-set score
    for name, estimator in candidates.items():
        estimator.fit(X_train, y_train)
        fit[name] = estimator.score(X_test, y_test)

    # Sort and print the dictionary by greatest R squared to least
    ranked = sorted(fit, key=fit.__getitem__, reverse=True)
    for name in ranked:
        print(f'{name}:\t{fit[name]}')

    # Select the model with the highest R squared
    best_name = ranked[0]
    print(f'Best model: {best_name} (R2:{fit[best_name]})')
    model = candidates[best_name]

    # Save the fitted estimator itself (not its score) and close the file.
    with open(model_path, 'wb') as model_file:
        pickle.dump(model, model_file)

    return model


def clustering(df):
    """Cluster the rows of *df* with average-linkage agglomerative clustering.

    Distances between rows are Gower distances computed by the third-party
    ``gower`` package, which is installed with pip on first use if it cannot
    be imported.

    Args:
        df: DataFrame whose rows are clustered. It is not modified.

    Returns:
        A copy of *df* with an added 'cluster' column of cluster labels.
    """
    import subprocess
    import sys

    from sklearn.cluster import AgglomerativeClustering

    try:
        import gower
    except ImportError:
        # Install only when missing, rather than on every call.
        subprocess.check_call([sys.executable, '-m', 'pip', 'install', 'gower'])
        import gower

    distance_matrix = gower.gower_matrix(df)

    # Newer scikit-learn names this argument `metric`; older releases only
    # accept `affinity`, so fall back when `metric` is rejected.
    try:
        agg = AgglomerativeClustering(metric='precomputed', linkage='average')
    except TypeError:
        agg = AgglomerativeClustering(affinity='precomputed', linkage='average')
    agg.fit(distance_matrix)

    # make a cluster column
    df_wcluster = df.copy()
    df_wcluster['cluster'] = agg.labels_
    return df_wcluster


In [22]:
# Work on a copy so the cleaned `df` from the previous cell stays untouched.
df_modeling = df.copy()

# algorithm_selection(df_modeling, cols_to_drop=['key_0', 'tweet_id', 'author_id', 'like_count', 'retweet_count'])

# Cross-validated linear regression predicting like_count from every other
# column. The value returned is a fitted LinearRegression model, not a
# DataFrame, despite the `df_` prefix.
df_MLR = fit_crossvalidate_mlr(df_modeling, 10, 'like_count', True)

# Add a 'cluster' column to the modeling frame.
df_modeling = clustering(df_modeling)


Average R-squared:	-82.46309669109394


# Automation
* Dynamically select the best algorithm for your regression and classification models
* Demonstrate through the feature importance metric which features should be included in the model. However, you don't need to set up an automated selection of features. You can manually decide which features to include
* Dynamically save the best fitting model to a .sav file in the same folder as your .ipynb
* Arrange your .ipynb file so that the "Run all" command will handle all steps above in order from data collection to .sav file