# Linear regression

In [2]:
def lin_reg(Y, X):
    """Fit an ordinary least squares regression of Y on X and show its summary.

    A constant column is added to ``X`` with ``sm.add_constant`` before the
    model is fitted.

    Args:
        Y: Dependent variable, passed to ``sm.OLS`` as the first argument.
        X: Explanatory variable(s), passed to ``sm.add_constant``.

    Returns:
        None. The summary table is handed to ``display``.

    Note:
        ``display`` is not imported in this function; it is presumably the
        IPython/Jupyter built-in, so this is expected to run in a notebook.
    """
    import statsmodels.api as sm

    design = sm.add_constant(X)
    fitted = sm.OLS(Y, design).fit()
    display(fitted.summary())

# Cronbach's alpha if item removed

In [1]:
# cronbach_alpha_test() -- returns just Cronbach's alpha (raw alpha)
# cronbach_alpha() -- prints raw alpha plus alpha if item removed, item mean and StdDev
 
import pandas as pd

def cronbach_alpha_test(items):
    """Compute the raw Cronbach's alpha of a set of items.

    Args:
        items: Anything ``pd.DataFrame`` accepts. Each column is treated as
            one item and each row as one observation.

    Returns:
        ``k / (k - 1) * (1 - sum_of_item_variances / variance_of_row_totals)``
        as a float, where ``k`` is the number of columns and every variance
        is computed with ``ddof=1``.
    """
    frame = pd.DataFrame(items)
    n_items = frame.shape[1]

    # Variance of each item (column), added up across all items.
    per_item_variance = frame.var(axis=0, ddof=1)
    summed_item_variance = float(per_item_variance.sum())

    # Variance of the total score (row sum) across observations.
    row_totals = frame.sum(axis=1)
    total_score_variance = float(row_totals.var(ddof=1))

    variance_ratio = summed_item_variance / total_score_variance
    return n_items / float(n_items - 1) * (1 - variance_ratio)

def cronbach_alpha(df):
    """Print raw Cronbach's alpha plus per-item diagnostics for a DataFrame.

    For every column the function prints the alpha obtained when that column
    is left out, together with the column's mean and standard deviation.
    All figures are rounded/formatted to two decimals.

    Args:
        df: DataFrame whose columns are the items. It is indexed with a list
            of column labels and passed to ``cronbach_alpha_test``.

    Returns:
        None. Results are written with ``print``.
    """
    # Print raw alpha
    alpha = round(cronbach_alpha_test(df), 2)
    print("Raw alpha: ", alpha)

    # Mean and standard deviation of every item (column).
    item_means = df.mean(axis=0)
    item_stds = df.std(axis=0)

    columns = list(df)
    for position, dropped in enumerate(columns):
        # Compare labels for equality. A containment test (`x not in dropped`)
        # is a substring check for string labels -- removing "q10" would also
        # remove "q1" -- and raises TypeError for integer labels.
        remaining = [label for label in columns if label != dropped]
        reduced_alpha = round(cronbach_alpha_test(df[remaining]), 2)
        # .iloc keeps the lookup positional regardless of the label type.
        print("Raw alpha if {} removed: ".format(dropped), "{:.2f}".format(reduced_alpha),
              "    Item mean: ", "{:.2f}".format(item_means.iloc[position]),
              "    Item StdDev: ", "{:.2f}".format(item_stds.iloc[position]))
    print("")

# Gradient Boosting classifier

In [5]:
def xgb_loop(Y, X, *, n_runs=20, test_size=0.33):
    """Print the mean hold-out accuracy of an XGBoost classifier.

    Each run draws a fresh random train/test split, fits a new
    ``xgboost.XGBClassifier`` with default settings on the training part and
    scores it on the test part. The accuracies of all runs are averaged.

    Args:
        Y: Target labels; flattened with ``np.ravel`` before fitting.
        X: Feature data.
        n_runs: Number of random splits to average over (default 20).
        test_size: Value forwarded to ``train_test_split`` as ``test_size``
            (default 0.33).

    Returns:
        None. The mean accuracy is printed as a percentage with two decimals.

    Raises:
        statistics.StatisticsError: If ``n_runs`` is 0, because there is
            nothing to average.
    """
    import statistics

    import numpy as np
    import xgboost
    from sklearn import model_selection
    from sklearn.metrics import accuracy_score

    accuracy_list = []
    for _ in range(n_runs):
        X_train, X_test, y_train, y_test = model_selection.train_test_split(
            X, Y, test_size=test_size)
        # fit model to training data
        model = xgboost.XGBClassifier()
        model.fit(X_train, np.ravel(y_train))
        # make predictions for test data; values are rounded to integers
        # before scoring, exactly as before
        y_pred = model.predict(X_test)
        predictions = [round(value) for value in y_pred]
        # evaluate predictions
        accuracy_list.append(accuracy_score(y_test, predictions))
    model_accuracy = statistics.mean(accuracy_list)
    print("Accuracy:", round(model_accuracy * 100, 2), "%")

# Logistic regression

In [6]:
# StatsModels logistic regression user defined function

def logreg(y, x):
    """Fit a statsmodels logistic regression and print accuracy and summary.

    A constant column is added to ``x`` with ``sm.add_constant`` and the
    model is fitted with ``method='newton'``. Accuracy is the sum of the
    diagonal of ``result.pred_table()`` divided by the sum of the whole
    table, expressed as a percentage rounded to three decimals.

    Args:
        y: Dependent variable, passed to ``sm.Logit`` as the first argument.
        x: Explanatory variable(s), passed to ``sm.add_constant``.

    Returns:
        None. The accuracy line and the model summary are printed.
    """
    import statsmodels.api as sm

    design = sm.add_constant(x)
    result = sm.Logit(y, design).fit(method='newton')

    # Build the prediction table once and reuse it for both sums.
    prediction_table = result.pred_table()
    correct = prediction_table.diagonal().sum()
    total = prediction_table.sum()
    accuracy = round(correct / total * 100, 3)

    print("Model accuracy: ", str(accuracy) + "%")
    print(result.summary())

# Naive Bayes

In [7]:
def my_NaiveBayes(dat, classifier, *, n_runs=20, test_size=0.2):
    """Print the mean hold-out accuracy of a Gaussian Naive Bayes classifier.

    Each run draws a fresh random train/test split, refits the same
    ``GaussianNB`` instance on the training part and scores it on the test
    part. The accuracies of all runs are averaged.

    Args:
        dat: Feature data, passed to ``train_test_split`` first.
        classifier: Target labels, passed to ``train_test_split`` second.
        n_runs: Number of random splits to average over (default 20).
        test_size: Value forwarded to ``train_test_split`` as ``test_size``
            (default 0.2).

    Returns:
        None. The mean accuracy is printed as a percentage with two decimals.

    Raises:
        statistics.StatisticsError: If ``n_runs`` is 0, because there is
            nothing to average.
    """
    import statistics

    from sklearn import metrics
    from sklearn.model_selection import train_test_split
    from sklearn.naive_bayes import GaussianNB

    gnb = GaussianNB()
    accuracy_list = []

    for _ in range(n_runs):
        x_train, x_test, y_train, y_test = train_test_split(
            dat, classifier, test_size=test_size)
        gnb.fit(x_train, y_train)
        y_pred = gnb.predict(x_test)
        accuracy_list.append(metrics.accuracy_score(y_test, y_pred))

    # Round after scaling to a percentage: rounding the fraction first and
    # multiplying by 100 afterwards can print float artefacts such as
    # 93.33000000000001.
    model_accuracy = round(statistics.mean(accuracy_list) * 100, 2)

    print("Accuracy:", model_accuracy, "%")

# Sentiment analysis

In [6]:
# Condense code for barplot labels

def add_value_labels(ax, spacing=5):
    """Annotate every patch of *ax* with its height, shown to two decimals.

    Args:
        ax: Object exposing ``patches`` (each offering ``get_height``,
            ``get_x`` and ``get_width``) and an ``annotate`` method, e.g. the
            axes of a bar plot.
        spacing: Vertical offset, in points, between the end of a bar and
            its label. The sign is flipped for bars of negative height.
    """
    for bar in ax.patches:
        height = bar.get_height()
        centre_x = bar.get_x() + bar.get_width() / 2

        # Labels sit above non-negative bars and hang below negative ones.
        if height < 0:
            offset, anchor = spacing * -1, 'top'
        else:
            offset, anchor = spacing, 'bottom'

        ax.annotate(
            f"{height:.2f}",
            (centre_x, height),
            xytext=(0, offset),
            textcoords="offset points",
            ha='center',
            va=anchor)

def analyse_corpus_sentiment(text):
    """Plot the mean VADER sentiment scores of a collection of texts.

    Every text is split into sentences with the spaCy ``en_core_web_sm``
    pipeline and the FIRST sentence is scored with VADER. The ``neg``,
    ``neu``, ``pos`` and ``compound`` scores are averaged over all texts,
    rounded to two decimals and drawn as a labelled bar chart.

    Args:
        text: Iterable of strings, one per document.

    Returns:
        None. A bar chart is drawn on the current matplotlib figure.

    Note:
        A later definition in this file reuses this function name (it shows
        the scores as a table instead); whichever is executed last is the one
        that stays bound to the name.
    """
    import spacy
    from vaderSentiment import vaderSentiment
    import numpy as np
    import pandas as pd
    import matplotlib.pyplot as plt

    english = spacy.load("en_core_web_sm")
    # The analyzer does not depend on the document, so build it once.
    analyzer = vaderSentiment.SentimentIntensityAnalyzer()

    negative = []
    neutral = []
    positive = []
    compound = []

    for document in text:
        # NOTE(review): only the first sentence of each text contributes to
        # the scores, as before -- confirm this is intended rather than an
        # average over every sentence.
        first_sentence = next(iter(english(document).sents), None)
        if first_sentence is None:
            # A text without any sentence has nothing to score; skip it
            # instead of failing with IndexError.
            continue
        scores = analyzer.polarity_scores(str(first_sentence))

        negative.append(scores['neg'])
        neutral.append(scores['neu'])
        positive.append(scores['pos'])
        compound.append(scores['compound'])

    sentiment_score = pd.DataFrame({
        'Negative': [round(np.mean(negative), 2)],
        'Neutral': [round(np.mean(neutral), 2)],
        'Positive': [round(np.mean(positive), 2)],
        'Compound': [round(np.mean(compound), 2)],
    })

    ax = sentiment_score.plot(kind='bar')
    ax.set_xlabel("Sentiment")
    plt.grid(True)
    # VADER's compound score ranges from -1 to 1; widen the axis downwards
    # when the mean is negative so that bar is not cut off at zero.
    lower_limit = -1 if sentiment_score['Compound'].iloc[0] < 0 else 0
    plt.ylim([lower_limit, 1])
    add_value_labels(ax)

In [1]:
# Score as a list

def analyse_corpus_sentiment(text):
    """Show the mean VADER sentiment scores of a collection of texts as a table.

    Every text is split into sentences with the spaCy ``en_core_web_sm``
    pipeline and the FIRST sentence is scored with VADER. The ``neg``,
    ``neu``, ``pos`` and ``compound`` scores are averaged over all texts,
    rounded to two decimals and shown as a one-row DataFrame.

    Args:
        text: Iterable of strings, one per document.

    Returns:
        None. The DataFrame is handed to ``display``.

    Note:
        ``display`` is not imported in this function; it is presumably the
        IPython/Jupyter built-in. This definition shares its name with an
        earlier one in this file that plots the scores; whichever is executed
        last is the one that stays bound to the name.
    """
    import spacy
    from vaderSentiment import vaderSentiment
    import numpy as np
    import pandas as pd

    english = spacy.load("en_core_web_sm")
    # The analyzer does not depend on the document, so build it once.
    analyzer = vaderSentiment.SentimentIntensityAnalyzer()

    negative = []
    neutral = []
    positive = []
    compound = []

    for document in text:
        # NOTE(review): only the first sentence of each text contributes to
        # the scores, as before -- confirm this is intended rather than an
        # average over every sentence.
        first_sentence = next(iter(english(document).sents), None)
        if first_sentence is None:
            # A text without any sentence has nothing to score; skip it
            # instead of failing with IndexError.
            continue
        scores = analyzer.polarity_scores(str(first_sentence))

        negative.append(scores['neg'])
        neutral.append(scores['neu'])
        positive.append(scores['pos'])
        compound.append(scores['compound'])

    sentiment_score = pd.DataFrame({
        'Negative': [round(np.mean(negative), 2)],
        'Neutral': [round(np.mean(neutral), 2)],
        'Positive': [round(np.mean(positive), 2)],
        'Compound': [round(np.mean(compound), 2)],
    })

    display(sentiment_score)