# Data Importing

In [1]:
# Importing modules
import pandas as pd
import numpy as np
import scipy.stats as stats
import statsmodels.api as sm

from sklearn.linear_model import LinearRegression

# Read data into papers
data = pd.read_csv("/Users/atchoo/Google Drive/Research Assistant/Work With David Knight/Survey of the Incarcerated/TMPPoliticalSurveyFULL_ForDavid.csv", encoding = "ISO-8859-1")

# Print head
# print(data.columns)

In [2]:
# Insert an id index for future work
data["Survey_ID"] = data.index + 1

# Remove the columns
str_data = data[["Survey_ID", "explain_politics_changed_since_incarcerated",\
                 "explain_race_affects_politics", 'identifies_as_black', 'identifies_as_white',
                 "identifies_as_native", 'identifies_as_asian', 
                 'identifies_as_hawaiian_or_pac_islander', 'identifies_as_other_race',
                 'identifies_as_not_sure_of_race', 'identifies_as_hispanic_or_latinx',
                 "age", "gender"]]
# Print out the first rows of papers
str_data.columns = ["Survey_ID", "p_change", "r_effect", "black", "white", "native", "asian",
                   "hawaiian", "other_race", "unsure_race", "latinx", "age", "gender"]
str_data = str_data.dropna()

# Make Remove Stopword Functions

In [3]:
# Load the regular expression library and the nltk word library
import re
import nltk

# Create a function to remove nonsense words
words = set(nltk.corpus.words.words())
def clean_sent(sent):
    return " ".join(w for w in nltk.wordpunct_tokenize(sent) \
     if w.lower() in words or not w.isalpha())

strv_list = ["p_change", "r_effect"]
for strv in strv_list:
    # Remove punctuation & Convert the titles to lowercase
    str_data[strv] = str_data[strv].map(lambda x: re.sub(r'[^\w\s]', "", x)).map(lambda x: x.lower())
    # Remove nonsense words
    str_data[strv] = str_data[strv].apply(clean_sent)
    # Transform blank cells to NaN & Drop NaN
    str_data = str_data.replace(r'^\s*$', np.nan, regex=True).dropna()

In [4]:
str_data.head()

Unnamed: 0,Survey_ID,p_change,r_effect,black,white,native,asian,hawaiian,other_race,unsure_race,latinx,age,gender
0,1,no,not at all not racist all of the human race is...,False,True,False,False,False,False,False,False,46-55,Man
2,3,i feel that the parole board the some of the p...,not sure about that,False,True,False,False,False,False,False,False,46-55,Woman
3,4,yes bit,i am not really sure about that one,False,True,False,False,False,False,False,False,36-45,Woman
5,6,most are full of,i do not understand this question maybe theres...,True,False,False,False,False,False,False,False,26-35,Woman
6,7,i never considered the prison population or pr...,i try to vote based on the but i am and many i...,False,True,False,False,False,False,False,False,46-55,Woman


In [5]:
# remove stop words
# nltk.download('stopwords')
from nltk.corpus import stopwords
from gensim.utils import simple_preprocess

def remove_stopwords(texts):
    return [[word for word in simple_preprocess(str(doc)) if word not in stop_words] for doc in texts]

# Include the Topic info

In [11]:
def add_the_matrix(extrastopword, csvfile, tlist=["topic 5"]):
    
    assert isinstance(extrastopword, list), "Should be a list of the extra stop words."
    assert isinstance(csvfile, str), "Should be the path of the matrix file."
    
    # change the stop words
    stop_words = stopwords.words("english")
    stop_words.extend(["don", "people", "bill", "step", "act", "first", "u", "n", 
                       "na", "non"]+extrastopword)
    yesnno = ["yes", "no"]
    stop_words =  [word for word in stop_words if word not in yesnno]
    # remove stop words
    r_effect = [[word for word in simple_preprocess(str(doc)) if word not in stop_words] for doc in str_data["r_effect"]]
    r_effect = pd.DataFrame([x for x in r_effect if x])
    
    # create index variable for whether a row is left
    r_effect2 = []
    for doc in str_data["r_effect"]:
        if all(word in stop_words for word in simple_preprocess(doc)):
               i_str = False
        else: 
               i_str = True
        r_effect2.append(i_str)
    inBTM = pd.DataFrame(r_effect2, columns=["inBTM"])
    # filter str_data based on inBTM result into str_datar
    str_datar = pd.concat([str_data.reset_index(drop=True), inBTM], axis=1)
    str_datar = str_datar[str_datar["inBTM"]==True]
    
    # read in the topicmatrix file
    matrix = pd.read_csv(csvfile, 
                         names=["topic 1", "topic 2", "topic 3", "topic 4", "topic 5", "topic 6"])
    a = matrix.idxmax(axis=1)
    b = matrix.T.apply(lambda x: x.nlargest(2).idxmin())
    c = matrix.T.apply(lambda x: x.nlargest(3).idxmin())
    topic = pd.concat([a, b, c], axis=1, ignore_index=False)

    # concatenate topic matrix and str_datar as str_datar
    str_datar2 = pd.concat([str_datar.reset_index(drop=True), topic], axis=1)
    str_datar2 = str_datar2.rename(columns={0:"fsttopic", 1:"sndtopic", 2:"trdtopic"})

    # CREATE THE CHISQUARE DATAFRAME
    race = ["black", "white", "native", "asian", "hawaiian", "other_race", "unsure_race", "latinx"]
    topic = ["fsttopic", "sndtopic", "trdtopic"]
    chisq = str_datar2.loc[:, ["Survey_ID"]+race+topic].dropna()
        ## clean the race variable a little bit
    chisq.loc[:, race] = chisq.loc[:, race].astype(int)
    chisq["Race"] = chisq.loc[:, race].idxmax(axis=1)
    
    # CREATE THE REGRESSION DATAFRAME
    chisq["fst5"] = chisq["fsttopic"].isin(tlist)
    chisq["snd5"] = chisq["sndtopic"].isin(tlist)
    chisq["trd5"] = chisq["trdtopic"].isin(tlist)
    cor = chisq.drop(["fsttopic", "sndtopic", "trdtopic", "Race"], axis=1)
    cov = str_data[["Survey_ID", "age", "gender"]]
    cov = pd.get_dummies(cov, columns=["age", "gender"])
    lrdata = cor.merge(cov, on="Survey_ID")
    
    return lrdata

In [12]:
common_csv = "~/Google Drive/Research Assistant/Work With David Knight/Survey of the Incarcerated/BTM/BTM_Regression_Exploration/"
extra1 = []
csv1 = "All_3095/topicmatrix.csv"

extra2 = ["violent", "race"]
csv2 = "NoViloentRace_3082/topicmatrix_3082.csv"

extra3 = ["violent", "race", "white", "black"]
csv3 = "NoViloentRaceWhiteBlack_3053/topicmatrix_3053.csv"

In [13]:
# All_3095
lrdata4 = add_the_matrix(extra1, common_csv+csv1)
# NoViloentRace_3082
lrdata5 = add_the_matrix(extra2, common_csv+csv2)
# NoViloentRaceWhiteBlack_3053
# Most Important One
lrdata6 = add_the_matrix(extra3, common_csv+csv3)

### Clear out all the white samples

In [16]:
def clear_some(dataset):
    dataset_post = dataset[dataset["white"]!=1].drop(["white"], axis=1)
    
    return dataset_post

In [17]:
# All_3095
lrdata4 = clear_some(lrdata4)
# NoViloentRace_3082
lrdata5 = clear_some(lrdata5)
# NoViloentRaceWhiteBlack_3053
# Most Important One
lrdata6 = clear_some(lrdata6)

In [18]:
lrdata6.to_csv("~/Desktop/lrdata_nw5.csv")

# Linear Regression

In [11]:
lrdata4.columns

Index(['Survey_ID', 'black', 'native', 'asian', 'hawaiian', 'other_race',
       'unsure_race', 'latinx', 'fst4', 'snd4', 'trd4', 'age_18-25',
       'age_26-35', 'age_36-45', 'age_46-55', 'age_56-65', 'age_66+',
       'age_Under 18', 'gender_Gender non-conforming or non-binary or other',
       'gender_Man', 'gender_Prefer not to say', 'gender_Woman'],
      dtype='object')

In [12]:
lrdata4.shape

(1260, 22)

In [13]:
def reg_on_123(dataset4lr, dropX=[]):
    y1 = dataset4lr["fst4"]
    y2 = dataset4lr["snd4"]
    y3 = dataset4lr["trd4"]

    X = dataset4lr.drop(["Survey_ID", "fst4", "snd4", "trd4", "unsure_race", "age_Under 18",
                         "gender_Gender non-conforming or non-binary or other"]+dropX, axis=1)
    X = sm.add_constant(X)
    
    model1 = sm.OLS(y1, X).fit()
    model2 = sm.OLS(y2, X).fit()
    model3 = sm.OLS(y3, X).fit()
    
    return model1, model2, model3

def return_plist(dataset4lr, dropX=[]):
    model1, model2, model3 = reg_on_123(dataset4lr, dropX)
    
    a = (model1.summary2().tables[1]['P>|t|'] <= 0.05) | (model1.summary2().tables[1]['P>|t|'] >= 0.95)
    a1 = model1.summary2().tables[1]["Coef."]
    b = (model2.summary2().tables[1]['P>|t|'] <= 0.05) | (model2.summary2().tables[1]['P>|t|'] >= 0.95)
    b1 = model2.summary2().tables[1]["Coef."]
    c = (model3.summary2().tables[1]['P>|t|'] <= 0.05) | (model3.summary2().tables[1]['P>|t|'] >= 0.95)
    c1 = model3.summary2().tables[1]["Coef."]
    
    p_df = pd.concat([a1, a, b1, b, c1, c], axis=1)
    p_df.columns = ["fst_inlist_coef", "p<=0.05", "snd_inlist_coef", "p<=0.05", "trd_inlist_coef", "p<=0.05"]
    
    return p_df

### Keep All
These groups of models only excluded some variables to clear perfect multicollinearity.
The variables excluded for perfect multicollinearity include "Survey_ID", "fst1n2", "snd1n2", "trd1n2", "unsure_race","age_Under 18", "gender_Gender non-conforming or non-binary or other".

In [14]:
# All_3095
S3095_all = return_plist(lrdata4)
# NoViloentRace_3082
S3082_all = return_plist(lrdata4)
# NoViloentRaceWhiteBlack_3053
S3053_all = return_plist(lrdata4)

  return ptp(axis=axis, out=out, **kwargs)


In [15]:
print(S3095_all)

                          fst_inlist_coef  p<=0.05  snd_inlist_coef  p<=0.05  \
const                           -0.170852    False         0.262488    False   
black                            0.027052    False        -0.003425    False   
native                           0.009044    False         0.038654    False   
asian                           -0.024404    False        -0.002118     True   
hawaiian                        -0.111100    False        -0.047846    False   
other_race                      -0.018530    False         0.011895    False   
latinx                           0.052216    False        -0.096531     True   
age_18-25                        0.103636    False        -0.084340    False   
age_26-35                        0.146508    False        -0.064457    False   
age_36-45                        0.191294    False        -0.125978    False   
age_46-55                        0.204394    False        -0.095178    False   
age_56-65                        0.17888

In [16]:
print(S3082_all)

                          fst_inlist_coef  p<=0.05  snd_inlist_coef  p<=0.05  \
const                           -0.170852    False         0.262488    False   
black                            0.027052    False        -0.003425    False   
native                           0.009044    False         0.038654    False   
asian                           -0.024404    False        -0.002118     True   
hawaiian                        -0.111100    False        -0.047846    False   
other_race                      -0.018530    False         0.011895    False   
latinx                           0.052216    False        -0.096531     True   
age_18-25                        0.103636    False        -0.084340    False   
age_26-35                        0.146508    False        -0.064457    False   
age_36-45                        0.191294    False        -0.125978    False   
age_46-55                        0.204394    False        -0.095178    False   
age_56-65                        0.17888

In [17]:
print(S3053_all)

                          fst_inlist_coef  p<=0.05  snd_inlist_coef  p<=0.05  \
const                           -0.170852    False         0.262488    False   
black                            0.027052    False        -0.003425    False   
native                           0.009044    False         0.038654    False   
asian                           -0.024404    False        -0.002118     True   
hawaiian                        -0.111100    False        -0.047846    False   
other_race                      -0.018530    False         0.011895    False   
latinx                           0.052216    False        -0.096531     True   
age_18-25                        0.103636    False        -0.084340    False   
age_26-35                        0.146508    False        -0.064457    False   
age_36-45                        0.191294    False        -0.125978    False   
age_46-55                        0.204394    False        -0.095178    False   
age_56-65                        0.17888

### No Black
Other than same variables above to clear perfect multicollinearity, these groups of models also excluded "black".

In [18]:
nb = ["black"]

In [19]:
# All_3095
S3095_nb = return_plist(lrdata4, nb)
# NoRace_3082
S3082_nb = return_plist(lrdata5, nb)
# NoRaceWhiteBlack_3053
S3053_nb = return_plist(lrdata6, nb)

In [20]:
print(S3095_nb)

                          fst_inlist_coef  p<=0.05  snd_inlist_coef  p<=0.05  \
const                           -0.148845    False         0.259702    False   
native                          -0.005117    False         0.040447    False   
asian                           -0.037532    False        -0.000455     True   
hawaiian                        -0.123744    False        -0.046245    False   
other_race                      -0.033632    False         0.013807    False   
latinx                           0.045154    False        -0.095637     True   
age_18-25                        0.107631    False        -0.084846    False   
age_26-35                        0.149987    False        -0.064898    False   
age_36-45                        0.194888    False        -0.126433    False   
age_46-55                        0.208026    False        -0.095638    False   
age_56-65                        0.182578    False        -0.042819    False   
age_66+                          0.05296

In [21]:
print(S3082_nb)

                          fst_inlist_coef  p<=0.05  snd_inlist_coef  p<=0.05  \
const                            0.304276    False         0.136143    False   
native                           0.025566    False        -0.026387    False   
asian                           -0.065539    False         0.095247    False   
hawaiian                        -0.081006    False        -0.126140    False   
other_race                       0.009232    False         0.004000    False   
latinx                          -0.021593    False         0.065882     True   
age_18-25                       -0.065607    False         0.033804    False   
age_26-35                       -0.085781    False         0.007143     True   
age_36-45                       -0.036116    False         0.013058    False   
age_46-55                       -0.047326    False         0.004907     True   
age_56-65                       -0.060026    False        -0.051176    False   
age_66+                          0.00337

In [22]:
S3053_nb.to_csv("~/Desktop/b.csv")

In [30]:
print(S3053_nb)

                          fst_inlist_coef  p<=0.05  snd_inlist_coef  p<=0.05  \
const                            0.667824     True         0.046075    False   
native                          -0.022704    False         0.053612    False   
asian                           -0.104465    False        -0.040499    False   
hawaiian                        -0.109611    False        -0.019065    False   
other_race                       0.039431    False        -0.016797    False   
latinx                          -0.011876    False        -0.023016    False   
age_18-25                       -0.186673    False         0.011847    False   
age_26-35                       -0.257080    False         0.013756    False   
age_36-45                       -0.292215    False         0.041047    False   
age_46-55                       -0.264013    False         0.010029     True   
age_56-65                       -0.272530    False         0.051660    False   
age_66+                         -0.18337

### Only Black
Other than same variables above to clear perfect multicollinearity, these groups of models also excluded all race binary variables excpet "black".

In [23]:
ob = ["native", "asian", "hawaiian", "other_race", "unsure_race", "latinx"]

In [24]:
# All_3095
S3095_ob = return_plist(lrdata4, ob)
# NoRace_3082
S3082_ob = return_plist(lrdata5, ob)
# NoRaceWhiteBlack_3053
S3053_ob = return_plist(lrdata6, ob)

In [25]:
print(S3095_ob)

                          fst_inlist_coef  p<=0.05  snd_inlist_coef  p<=0.05  \
const                           -0.150271    False         0.228824    False   
black                            0.015737    False         0.019847    False   
age_18-25                        0.104742    False        -0.070482    False   
age_26-35                        0.144662    False        -0.044567    False   
age_36-45                        0.185007    False        -0.100060    False   
age_46-55                        0.200418    False        -0.067811    False   
age_56-65                        0.171827    False        -0.015747    False   
age_66+                          0.041578    False        -0.073434    False   
gender_Man                       0.147123    False        -0.032794    False   
gender_Prefer not to say         0.093786    False        -0.120566    False   
gender_Woman                     0.127709    False        -0.002905     True   

                          trd_inlist_co

In [26]:
S3095_ob.to_csv("~/Desktop/b.csv")

In [27]:
print(S3082_ob)

                          fst_inlist_coef  p<=0.05  snd_inlist_coef  p<=0.05  \
const                            0.296349    False         0.134771    False   
black                            0.001504    False        -0.046602     True   
age_18-25                       -0.054609    False         0.072436    False   
age_26-35                       -0.072554    False         0.038174    False   
age_36-45                       -0.022050    False         0.039479    False   
age_46-55                       -0.032462    False         0.034466    False   
age_56-65                       -0.045433    False        -0.022273    False   
age_66+                          0.023152    False        -0.054171    False   
gender_Man                      -0.096650    False         0.074550    False   
gender_Prefer not to say        -0.223356    False         0.090491    False   
gender_Woman                    -0.113558    False         0.049403    False   

                          trd_inlist_co

In [28]:
print(S3053_ob)

                          fst_inlist_coef  p<=0.05  snd_inlist_coef  p<=0.05  \
const                            0.610565     True         0.078262    False   
black                            0.013414    False        -0.012108    False   
age_18-25                       -0.147198    False        -0.003936     True   
age_26-35                       -0.215913    False        -0.000413     True   
age_36-45                       -0.252211    False         0.029452    False   
age_46-55                       -0.226424    False         0.001780     True   
age_56-65                       -0.229565    False         0.039657    False   
age_66+                         -0.138711    False         0.022667    False   
gender_Man                      -0.213248    False         0.124160    False   
gender_Prefer not to say        -0.288428     True         0.161191    False   
gender_Woman                    -0.246964     True         0.055391    False   

                          trd_inlist_co

In [29]:
S3053_ob.to_csv("~/Desktop/a.csv")