In [57]:
# Import Python packages
import itertools
import os
import random
import re
import time

import matplotlib.pyplot as plt
import nltk
import numpy as np
import openai
import pandas as pd
import seaborn as sns
import sklearn
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.utils.class_weight import compute_class_weight

In [58]:
# Lemmatize words in the response
def lemmatize_text(text):
    return [lemmatizer.lemmatize(w) for w in w_tokenizer.tokenize(text.lower())]

# Initialize the lemmatizer and whitespace tokenizer
w_tokenizer = nltk.tokenize.WhitespaceTokenizer()
lemmatizer = nltk.stem.WordNetLemmatizer()

stop_words = stopwords.words('english')

In [59]:
# Read the .csv file containing the original ChatGPT responses
my_anns = pd.read_csv("../data/modelData.csv", encoding='cp1252')

In [61]:
# Perform some standardization tasks on the ChatGPT responses
# (`re` is imported with the other packages in the first cell.)

# Keep only rows that carry a label in the first threat column. The .copy()
# detaches the result from the original frame so the column assignments below
# operate on a real DataFrame instead of a slice (no SettingWithCopyWarning).
my_anns = my_anns[my_anns["Residential & Commerial Development"].notna()].copy()

# Strip everything that is not an ASCII letter or a space.
# NOTE(review): this also deletes newlines/tabs outright, so words separated
# only by a line break are fused together -- confirm that is intended.
my_anns["Response"] = [re.sub('[^a-zA-Z ]+', '', s) for s in my_anns["Response"]]

# The patterns below are regular expressions, so regex=True must be explicit:
# pandas >= 2.0 treats the pattern as a literal string by default, which would
# silently turn every one of these steps into a no-op.
my_anns["Response"] = my_anns["Response"].str.replace(r'\d+', '', regex=True)   # digits
my_anns["Response"] = my_anns["Response"].str.replace(r'w{3}', '', regex=True)  # "www"
my_anns["Response"] = my_anns["Response"].str.replace(r'\s+', ' ', regex=True)  # collapse whitespace
# Drop stand-alone single letters. The replacement is a single space (not the
# empty string) so the two neighbouring words are not glued into one token.
my_anns["Response"] = my_anns["Response"].str.replace(r'\s+[a-zA-Z]\s+', ' ', regex=True)

# Tokenize + lemmatize, then remove English stop words
my_anns["tokenizedResponse"] = my_anns["Response"].apply(lemmatize_text)
my_anns["tokenizedResponse"] = my_anns["tokenizedResponse"].apply(
    lambda tokens: [word for word in tokens if word not in stop_words]
)

# Sorted list of every distinct token seen in the training responses; later
# used to restrict new responses to the vocabulary known at training time.
trained_features = itertools.chain.from_iterable(my_anns["tokenizedResponse"])
trained_features = sorted(set(trained_features), key=lambda s: s.lower())

# CountVectorizer expects one string per document, so re-join the tokens
my_anns["tokenizedResponse"] = [" ".join(t) for t in my_anns["tokenizedResponse"]]


def dummy_fun(doc):
    """Identity function; returns ``doc`` unchanged (not used by this cell)."""
    return doc


# Fit the bag-of-words vocabulary on the training responses. The fitted
# `vectorizer` defines the feature columns every classifier is trained on, so
# it must be reused (transform only) for any data the models are applied to.
vectorizer = CountVectorizer()
vectorizedResponse = vectorizer.fit_transform(my_anns["tokenizedResponse"]).toarray()

## Naive Bayes Classifiers ##

In [62]:
# Label column in my_anns for each threat category, in T1..T13 order.
# The spellings match the CSV headers exactly (typos included) -- do not "fix" them.
THREAT_COLUMNS = [
    "Residential & Commerial Development",           # T1
    "Agriculture & Aquaculture",                     # T2
    "Energy Producting & Mining",                    # T3
    "Transportation & Service Corridors",            # T4
    "Biological Resource Use",                       # T5
    "Human Intrusions & Disturbance",                # T6
    "Natural System Modifcations",                   # T7
    "Invasive & Other Problematic Species & Genes",  # T8
    "Pollution",                                     # T9
    "Geological Events",                             # T10
    "Climate Change & Severe Weather",               # T11
    "Limiting/Intrinsic Population Factors",         # T12
    "DISEASE?",                                      # T13
]
N_RESAMPLES = 1000     # number of random train/validation splits per threat
TEST_FRACTION = 0.20   # share of rows held out for validation in each split
SPLIT_SEED = 42        # fixed seed so scores and selected models are reproducible


def fit_threat_classifier(features, labels, test_size=TEST_FRACTION, random_state=None):
    """Fit one MultinomialNB on a stratified random split and score it.

    Parameters
    ----------
    features : 2-D array of token counts, one row per response.
    labels : label column aligned with ``features``; also used for stratifying.
    test_size : fraction of rows held out for validation.
    random_state : seed or ``numpy.random.RandomState`` forwarded to
        ``train_test_split``.

    Returns
    -------
    (feature_totals, classifier, f1)
        ``feature_totals`` is the column-wise sum of the training rows (a
        fingerprint of which rows landed in the training split),
        ``classifier`` is the fitted model and ``f1`` its validation F1 score
        for the positive class.
    """
    x_train, x_val, y_train, y_val = train_test_split(
        features, labels, test_size=test_size, stratify=labels, random_state=random_state
    )
    # Labels are compared as strings; the positive class is the literal "True".
    # (assumes the label columns hold booleans -- TODO confirm)
    y_train = [str(y) for y in y_train]
    y_val = [str(y) for y in y_val]

    classifier = MultinomialNB()
    classifier.fit(x_train, y_train)
    y_pred = classifier.predict(x_val)

    f1 = sklearn.metrics.f1_score(y_val, y_pred, pos_label="True")
    return np.sum(x_train, axis=0), classifier, f1


# One accumulator list per threat, indexed in THREAT_COLUMNS order
threat_data = [[] for _ in THREAT_COLUMNS]   # training-split fingerprints
threat_fit = [[] for _ in THREAT_COLUMNS]    # fitted classifiers
threat_f1 = [[] for _ in THREAT_COLUMNS]     # validation F1 scores

# A single RandomState is shared by all splits: each call advances it, so every
# resample differs while the whole run stays reproducible.
split_rng = np.random.RandomState(SPLIT_SEED)

for _ in range(N_RESAMPLES):
    for threat_pos, label_column in enumerate(THREAT_COLUMNS):
        feature_totals, classifier, f1 = fit_threat_classifier(
            vectorizedResponse, my_anns[label_column], random_state=split_rng
        )
        threat_data[threat_pos].append(feature_totals)
        threat_fit[threat_pos].append(classifier)
        threat_f1[threat_pos].append(f1)

# Expose the per-threat lists under the names the following cells use
(t1_data, t2_data, t3_data, t4_data, t5_data, t6_data, t7_data,
 t8_data, t9_data, t10_data, t11_data, t12_data, t13_data) = threat_data
(t1_fit, t2_fit, t3_fit, t4_fit, t5_fit, t6_fit, t7_fit,
 t8_fit, t9_fit, t10_fit, t11_fit, t12_fit, t13_fit) = threat_fit
(t1_f1, t2_f1, t3_f1, t4_f1, t5_f1, t6_f1, t7_f1,
 t8_f1, t9_f1, t10_f1, t11_f1, t12_f1, t13_f1) = threat_f1

# Persist the F1 scores: one column per threat ("t1_f1".."t13_f1"), one row per resample
scores_dict = {f"t{n}_f1": scores for n, scores in enumerate(threat_f1, start=1)}
scores_df = pd.DataFrame(scores_dict)
scores_df.to_csv("../data/f1_scores.csv")

In [63]:
# Sanity check: show the summed training-feature vector recorded for the
# first T1 resample (one entry per vocabulary column)
print(t1_data[0])

[ 1 16  1 ...  3  0  1]


In [47]:
# Minimum validation F1 (exclusive) a fitted model needs in order to be selected
F1_THRESHOLD = 0.8


def select_indices_above(scores, threshold=F1_THRESHOLD):
    """Return the positions in ``scores`` whose value is strictly above ``threshold``."""
    return [pos for pos, score in enumerate(scores) if score > threshold]


def find_duplicate_indices(vectors):
    """Return the positions of all vectors that occur more than once.

    Vectors are compared element-wise (via their tuple form). Every member of
    a repeated group is reported, groups ordered by first appearance.
    """
    positions_by_vector = {}
    for pos, vector in enumerate(vectors):
        positions_by_vector.setdefault(tuple(vector), []).append(pos)
    return [
        pos
        for group in positions_by_vector.values()
        if len(group) > 1
        for pos in group
    ]


# Select models with an F1 score above F1_THRESHOLD from each model list
(t1_fit_idx, t2_fit_idx, t3_fit_idx, t4_fit_idx, t5_fit_idx, t6_fit_idx, t7_fit_idx,
 t8_fit_idx, t9_fit_idx, t10_fit_idx, t11_fit_idx, t12_fit_idx, t13_fit_idx) = (
    select_indices_above(scores)
    for scores in (t1_f1, t2_f1, t3_f1, t4_f1, t5_f1, t6_f1, t7_f1,
                   t8_f1, t9_f1, t10_f1, t11_f1, t12_f1, t13_f1)
)

# Find the resamples whose training data was duplicated, i.e. whose summed
# training-feature vector is identical to that of another resample.
# NOTE(review): these lists are only computed here; nothing in this cell
# removes the indices from the tN_fit_idx selections.
(t1_dupe_idx, t2_dupe_idx, t3_dupe_idx, t4_dupe_idx, t5_dupe_idx, t6_dupe_idx, t7_dupe_idx,
 t8_dupe_idx, t9_dupe_idx, t10_dupe_idx, t11_dupe_idx, t12_dupe_idx, t13_dupe_idx) = (
    find_duplicate_indices(vectors)
    for vectors in (t1_data, t2_data, t3_data, t4_data, t5_data, t6_data, t7_data,
                    t8_data, t9_data, t10_data, t11_data, t12_data, t13_data)
)


In [64]:
# Predict every threat on the training responses with the ensemble of selected
# models. Each resulting DataFrame has one row per response and one column
# (0, 1, 2, ...) per selected model.
#
# tN_fit_idx holds the *positions* of the selected models inside tN_fit, so
# the models must be looked up through those positions; iterating over
# range(len(tN_fit_idx)) would just take the first k models regardless of
# their F1 score.
# NOTE(review): the tN_dupe_idx lists computed earlier are not applied here.
selected_models = [
    (t1_fit, t1_fit_idx), (t2_fit, t2_fit_idx), (t3_fit, t3_fit_idx),
    (t4_fit, t4_fit_idx), (t5_fit, t5_fit_idx), (t6_fit, t6_fit_idx),
    (t7_fit, t7_fit_idx), (t8_fit, t8_fit_idx), (t9_fit, t9_fit_idx),
    (t10_fit, t10_fit_idx), (t11_fit, t11_fit_idx), (t12_fit, t12_fit_idx),
    (t13_fit, t13_fit_idx),
]

threat_preds = [
    pd.DataFrame.from_dict(
        {
            column: fitted_models[model_pos].predict(vectorizedResponse)
            for column, model_pos in enumerate(chosen_positions)
        }
    )
    for fitted_models, chosen_positions in selected_models
]

# Building all frames in one pass also guarantees that each tN_preds comes
# from its own threat's models (t11_preds used to be built from t1_preds).
(t1_preds, t2_preds, t3_preds, t4_preds, t5_preds, t6_preds, t7_preds,
 t8_preds, t9_preds, t10_preds, t11_preds, t12_preds, t13_preds) = threat_preds



In [65]:
# Write the prediction matrices to csv files, one file per threat (T1..T13)
prediction_outputs = [
    ("../data/preds/t1_preds.csv", t1_preds),
    ("../data/preds/t2_preds.csv", t2_preds),
    ("../data/preds/t3_preds.csv", t3_preds),
    ("../data/preds/t4_preds.csv", t4_preds),
    ("../data/preds/t5_preds.csv", t5_preds),
    ("../data/preds/t6_preds.csv", t6_preds),
    ("../data/preds/t7_preds.csv", t7_preds),
    ("../data/preds/t8_preds.csv", t8_preds),
    ("../data/preds/t9_preds.csv", t9_preds),
    ("../data/preds/t10_preds.csv", t10_preds),
    ("../data/preds/t11_preds.csv", t11_preds),
    ("../data/preds/t12_preds.csv", t12_preds),
    ("../data/preds/t13_preds.csv", t13_preds),
]
for csv_path, prediction_frame in prediction_outputs:
    prediction_frame.to_csv(csv_path)

In [66]:
# Apply the models to the whole response dataset
# Read the .csv file containing the original ChatGPT responses
my_anns = pd.read_csv("../data/subsample_gpt_responses_parseReady.csv")

# Perform the same standardization tasks that were applied to the training responses.
# Strip everything that is not an ASCII letter or a space.
my_anns["Response"] = [re.sub('[^a-zA-Z ]+', '', s) for s in my_anns["Response"]]

# The patterns below are regular expressions, so regex=True must be explicit:
# pandas >= 2.0 treats the pattern as a literal string by default, which would
# silently turn every one of these steps into a no-op.
my_anns["Response"] = my_anns["Response"].str.replace(r'\d+', '', regex=True)   # digits
my_anns["Response"] = my_anns["Response"].str.replace(r'w{3}', '', regex=True)  # "www"
my_anns["Response"] = my_anns["Response"].str.replace(r'\s+', ' ', regex=True)  # collapse whitespace
# Drop stand-alone single letters. The replacement is a single space (not the
# empty string) so the two neighbouring words are not glued into one token.
my_anns["Response"] = my_anns["Response"].str.replace(r'\s+[a-zA-Z]\s+', ' ', regex=True)

# Tokenize + lemmatize, then keep only non-stop-word tokens that were seen in
# the training data. A set makes the vocabulary lookup O(1) per token instead
# of scanning the sorted trained_features list every time.
trained_vocab = set(trained_features)
my_anns["tokenizedResponse"] = my_anns["Response"].apply(lemmatize_text)
my_anns["tokenizedResponse"] = my_anns["tokenizedResponse"].apply(
    lambda tokens: [word for word in tokens if word not in stop_words and word in trained_vocab]
)
my_anns["tokenizedResponse"] = [" ".join(t) for t in my_anns["tokenizedResponse"]]

# Reuse the vectorizer that was fitted on the training responses and only
# *transform* here. Fitting a fresh CountVectorizer on this data would build a
# new vocabulary, so column j would no longer mean the same token (and the
# column count could differ) compared with what the classifiers were trained
# on. dummy_fun is already defined in the training preprocessing cell, so it
# is not redefined here.
vectorizedResponse = vectorizer.transform(my_anns["tokenizedResponse"]).toarray()

In [67]:
# Perform the predictions on the full response set with the ensemble of
# selected models. Each resulting DataFrame has one row per response and one
# column (0, 1, 2, ...) per selected model.
#
# tN_fit_idx holds the *positions* of the selected models inside tN_fit, so
# the models must be looked up through those positions; iterating over
# range(len(tN_fit_idx)) would just take the first k models regardless of
# their F1 score.
# NOTE(review): the tN_dupe_idx lists computed earlier are not applied here.
selected_models = [
    (t1_fit, t1_fit_idx), (t2_fit, t2_fit_idx), (t3_fit, t3_fit_idx),
    (t4_fit, t4_fit_idx), (t5_fit, t5_fit_idx), (t6_fit, t6_fit_idx),
    (t7_fit, t7_fit_idx), (t8_fit, t8_fit_idx), (t9_fit, t9_fit_idx),
    (t10_fit, t10_fit_idx), (t11_fit, t11_fit_idx), (t12_fit, t12_fit_idx),
    (t13_fit, t13_fit_idx),
]

threat_preds = [
    pd.DataFrame.from_dict(
        {
            column: fitted_models[model_pos].predict(vectorizedResponse)
            for column, model_pos in enumerate(chosen_positions)
        }
    )
    for fitted_models, chosen_positions in selected_models
]

# Building all frames in one pass also guarantees that each tN_preds comes
# from its own threat's models (t11_preds used to be built from t1_preds).
(t1_preds, t2_preds, t3_preds, t4_preds, t5_preds, t6_preds, t7_preds,
 t8_preds, t9_preds, t10_preds, t11_preds, t12_preds, t13_preds) = threat_preds


In [68]:
# Write the full-dataset prediction matrices to csv files, one file per threat (T1..T13)
prediction_outputs_all = [
    ("../data/preds/t1_preds_all.csv", t1_preds),
    ("../data/preds/t2_preds_all.csv", t2_preds),
    ("../data/preds/t3_preds_all.csv", t3_preds),
    ("../data/preds/t4_preds_all.csv", t4_preds),
    ("../data/preds/t5_preds_all.csv", t5_preds),
    ("../data/preds/t6_preds_all.csv", t6_preds),
    ("../data/preds/t7_preds_all.csv", t7_preds),
    ("../data/preds/t8_preds_all.csv", t8_preds),
    ("../data/preds/t9_preds_all.csv", t9_preds),
    ("../data/preds/t10_preds_all.csv", t10_preds),
    ("../data/preds/t11_preds_all.csv", t11_preds),
    ("../data/preds/t12_preds_all.csv", t12_preds),
    ("../data/preds/t13_preds_all.csv", t13_preds),
]
for csv_path, prediction_frame in prediction_outputs_all:
    prediction_frame.to_csv(csv_path)

In [69]:
# Save off the final annotation data: the response table loaded from
# subsample_gpt_responses_parseReady.csv, including the cleaned "Response"
# column and the "tokenizedResponse" column added during preprocessing
my_anns.to_csv("../data/preds/final_anns.csv")