# Data Importing

In [1]:
# Importing modules
import pandas as pd
import numpy as np
import scipy.stats as stats
import statsmodels.api as sm

from sklearn.linear_model import LinearRegression

# Read data into papers
data = pd.read_csv("/Users/atchoo/Google Drive/Research Assistant/Work With David Knight/Survey of the Incarcerated/TMPPoliticalSurveyFULL_ForDavid.csv", encoding = "ISO-8859-1")

# Print head
# print(data.columns)

In [2]:
# Insert an id index for future work
data["Survey_ID"] = data.index + 1

# Remove the columns
str_data = data[["Survey_ID", "explain_politics_changed_since_incarcerated",\
                 'identifies_as_black', 'identifies_as_white',
                 "identifies_as_native", 'identifies_as_asian', 
                 'identifies_as_hawaiian_or_pac_islander', 'identifies_as_other_race',
                 'identifies_as_not_sure_of_race', 'identifies_as_hispanic_or_latinx',
                 "age", "gender"]]
# Print out the first rows of papers
str_data.columns = ["Survey_ID", "p_change", "black", "white", "native", "asian",
                   "hawaiian", "other_race", "unsure_race", "latinx", "age", "gender"]
str_data = str_data.dropna()

# Make Remove Stopword Functions

In [3]:
# Load the regular expression library and the nltk word library
import re
import nltk

# Create a function to remove nonsense words
words = set(nltk.corpus.words.words())
def clean_sent(sent):
    return " ".join(w for w in nltk.wordpunct_tokenize(sent) \
     if w.lower() in words or not w.isalpha())

strv_list = ["p_change"]
for strv in strv_list:
    # Remove punctuation & Convert the titles to lowercase
    str_data[strv] = str_data[strv].map(lambda x: re.sub(r'[^\w\s]', "", x)).map(lambda x: x.lower())
    # Remove nonsense words
    str_data[strv] = str_data[strv].apply(clean_sent)
    # Transform blank cells to NaN & Drop NaN
    str_data = str_data.replace(r'^\s*$', np.nan, regex=True).dropna()

In [4]:
str_data.head()

Unnamed: 0,Survey_ID,p_change,black,white,native,asian,hawaiian,other_race,unsure_race,latinx,age,gender
0,1,no,False,True,False,False,False,False,False,False,46-55,Man
2,3,i feel that the parole board the some of the p...,False,True,False,False,False,False,False,False,46-55,Woman
3,4,yes bit,False,True,False,False,False,False,False,False,36-45,Woman
5,6,most are full of,True,False,False,False,False,False,False,False,26-35,Woman
6,7,i never considered the prison population or pr...,False,True,False,False,False,False,False,False,46-55,Woman


In [5]:
# remove stop words
# nltk.download('stopwords')
from nltk.corpus import stopwords
from gensim.utils import simple_preprocess

def remove_stopwords(texts):
    return [[word for word in simple_preprocess(str(doc)) if word not in stop_words] for doc in texts]

# Data Exporting for C++

In [6]:
extrastopword = ["prison"]

In [7]:
stop_words = stopwords.words("english")
stop_words.extend(["don", "people", "bill", "step", "act", "first", "u", "n", 
                   "na", "non"]+extrastopword)
yesnno = ["yes", "no"]
stop_words =  [word for word in stop_words if word not in yesnno]

In [8]:
p_change = remove_stopwords(str_data["p_change"])
p_change = pd.DataFrame([x for x in p_change if x])
len(p_change)

3305

In [9]:
#p_change.to_csv('/Users/atchoo/Desktop/p_change3305.txt', header=None, index=None, sep=' ', mode='a')

# Include the Topic info

In [10]:
def add_the_matrix(extrastopword, csvfile, tlist):
    
    assert isinstance(extrastopword, list), "Should be a list of the extra stop words."
    assert isinstance(csvfile, str), "Should be the path of the matrix file."
    
    # change the stop words
    stop_words = stopwords.words("english")
    stop_words.extend(["don", "people", "bill", "step", "act", "first", "u", "n", 
                       "na", "non"]+extrastopword)
    yesnno = ["yes", "no"]
    stop_words =  [word for word in stop_words if word not in yesnno]
    # remove stop words
    p_change = [[word for word in simple_preprocess(str(doc)) if word not in stop_words] for doc in str_data["p_change"]]
    p_change = pd.DataFrame([x for x in p_change if x])
    
    # create index variable for whether a row is left
    p_change2 = []
    for doc in str_data["p_change"]:
        if all(word in stop_words for word in simple_preprocess(doc)):
               i_str = False
        else: 
               i_str = True
        p_change2.append(i_str)
    inBTM = pd.DataFrame(p_change2, columns=["inBTM"])
    # filter str_data based on inBTM result into str_datar
    str_datar = pd.concat([str_data.reset_index(drop=True), inBTM], axis=1)
    str_datar = str_datar[str_datar["inBTM"]==True]
    
    # read in the topicmatrix file
    matrix = pd.read_csv(csvfile, 
                         names=["topic 1", "topic 2", "topic 3", "topic 4", 
                                "topic 5", "topic 6", "topic 7", "topic 8", 
                                "topic 9", "topic 10", "topic 11", "topic 12"])
    a = matrix.idxmax(axis=1)
    b = matrix.T.apply(lambda x: x.nlargest(2).idxmin())
    c = matrix.T.apply(lambda x: x.nlargest(3).idxmin())
    topic = pd.concat([a, b, c], axis=1, ignore_index=False)

    # concatenate topic matrix and str_datar as str_datar
    str_datar2 = pd.concat([str_datar.reset_index(drop=True), topic], axis=1)
    str_datar2 = str_datar2.rename(columns={0:"fsttopic", 1:"sndtopic", 2:"trdtopic"})

    # CREATE THE CHISQUARE DATAFRAME
    race = ["black", "white", "native", "asian", "hawaiian", "other_race", "unsure_race", "latinx"]
    topic = ["fsttopic", "sndtopic", "trdtopic"]
    chisq = str_datar2.loc[:, ["Survey_ID"]+race+topic].dropna()
        ## clean the race variable a little bit
    chisq.loc[:, race] = chisq.loc[:, race].astype(int)
    chisq["Race"] = chisq.loc[:, race].idxmax(axis=1)
    
    # CREATE THE REGRESSION DATAFRAME
    chisq["fstinlist"] = chisq["fsttopic"].isin(tlist)
    chisq["sndinlist"] = chisq["sndtopic"].isin(tlist)
    chisq["trdinlist"] = chisq["trdtopic"].isin(tlist)
    cor = chisq.drop(["fsttopic", "sndtopic", "trdtopic", "Race"], axis=1)
    cov = str_data[["Survey_ID", "age", "gender"]]
    cov = pd.get_dummies(cov, columns=["age", "gender"])
    lrdata = cor.merge(cov, on="Survey_ID")
    
    return str_datar2, chisq, lrdata

In [11]:
common_csv = "~/Google Drive/Research Assistant/Work With David Knight/Survey of the Incarcerated/BTM/BTM_Regression_Exploration_PC/"
extra1 = ["prison"]
csv1 = "12topics/topicmatrix.csv"

In [12]:
strdata1, chisq1, lrdata1 = add_the_matrix(extra1, common_csv+csv1, tlist=["topic 7"])

In [13]:
#strdata1.to_csv("~/Desktop/10topic.csv")

In [14]:
lrdata1.shape

(3305, 23)

In [15]:
lrdata1.head()

Unnamed: 0,Survey_ID,black,white,native,asian,hawaiian,other_race,unsure_race,latinx,fstinlist,...,age_26-35,age_36-45,age_46-55,age_56-65,age_66+,age_Under 18,gender_Gender non-conforming or non-binary or other,gender_Man,gender_Prefer not to say,gender_Woman
0,1,0,1,0,0,0,0,0,0,True,...,0,0,1,0,0,0,0,1,0,0
1,3,0,1,0,0,0,0,0,0,False,...,0,0,1,0,0,0,0,0,0,1
2,4,0,1,0,0,0,0,0,0,False,...,0,1,0,0,0,0,0,0,0,1
3,6,1,0,0,0,0,0,0,0,False,...,1,0,0,0,0,0,0,0,0,1
4,7,0,1,0,0,0,0,0,0,False,...,0,0,1,0,0,0,0,0,0,1


# Chisq Test

In [16]:
def to_obs(ct):
    obs = np.array([ct.iloc[0][0:7].values,
                    ct.iloc[1][0:7].values,
                    ct.iloc[2][0:7].values,
                    ct.iloc[3][0:7].values,
                    ct.iloc[4][0:7].values,
                    ct.iloc[5][0:7].values,
                    ct.iloc[6][0:7].values,
                    ct.iloc[7][0:7].values])
    return obs

def ctest_race_topic(dataset4chisq):
    ct = pd.crosstab(dataset4chisq.Race, dataset4chisq.fsttopic, margins=True)
    obs = to_obs(ct)
    fst_p = stats.chi2_contingency(obs)[1]
    
    ct = pd.crosstab(dataset4chisq.Race, dataset4chisq.sndtopic, margins=True)
    obs = to_obs(ct)
    snd_p = stats.chi2_contingency(obs)[1]
    
    ct = pd.crosstab(dataset4chisq.Race, dataset4chisq.trdtopic, margins=True)
    obs = to_obs(ct)
    trd_p = stats.chi2_contingency(obs)[1]
    
    result_dic = {"fst_p":fst_p, "snd_p":snd_p, "trd_p":trd_p}
    print(result_dic)
    
    pass

In [17]:
ctest_race_topic(chisq1)

{'fst_p': 0.2358922952493069, 'snd_p': 0.4554223750039456, 'trd_p': 0.6332820374735445}


# Linear Regression

In [18]:
def reg_on_123(dataset4lr, dropX=[]):
    y1 = dataset4lr["fstinlist"]
    y2 = dataset4lr["sndinlist"]
    y3 = dataset4lr["trdinlist"]

    X = dataset4lr.drop(["Survey_ID", "fstinlist", "sndinlist", "trdinlist", "unsure_race", "age_Under 18",
                         "gender_Gender non-conforming or non-binary or other"]+dropX, axis=1)
    X = sm.add_constant(X)
    
    model1 = sm.OLS(y1, X).fit()
    model2 = sm.OLS(y2, X).fit()
    model3 = sm.OLS(y3, X).fit()
    
    return model1, model2, model3

def return_plist(dataset4lr, dropX=[]):
    model1, model2, model3 = reg_on_123(dataset4lr, dropX)
    
    a = (model1.summary2().tables[1]['P>|t|'] <= 0.05) | (model1.summary2().tables[1]['P>|t|'] >= 0.95)
    a1 = model1.summary2().tables[1]["Coef."]
    b = (model2.summary2().tables[1]['P>|t|'] <= 0.05) | (model2.summary2().tables[1]['P>|t|'] >= 0.95)
    b1 = model2.summary2().tables[1]["Coef."]
    c = (model3.summary2().tables[1]['P>|t|'] <= 0.05) | (model3.summary2().tables[1]['P>|t|'] >= 0.95)
    c1 = model3.summary2().tables[1]["Coef."]
    
    p_df = pd.concat([a1, a, b1, b, c1, c], axis=1)
    p_df.columns = ["fst_inlist_coef", "p<=0.05", "snd_inlist_coef", "p<=0.05", "trd_inlist_coef", "p<=0.05"]
    
    return p_df

In [19]:
#lrdata1.to_csv("~/Desktop/lrdata1.csv")

### Keep All
These groups of models only excluded some variables to clear perfect multicollinearity.
The variables excluded for perfect multicollinearity include "Survey_ID", "fst1n2", "snd1n2", "trd1n2", "unsure_race","age_Under 18", "gender_Gender non-conforming or non-binary or other".

In [20]:
T10 = return_plist(lrdata1)

In [34]:
print(T10)
T10.to_csv("~/Desktop/T10.csv")

                          fst_inlist_coef  p<=0.05  snd_inlist_coef  p<=0.05  \
const                            0.242306     True         0.014679    False   
black                            0.000706     True        -0.002942    False   
white                           -0.025534    False        -0.016283    False   
native                           0.007550    False        -0.004447    False   
asian                           -0.074682    False         0.071954     True   
hawaiian                         0.043459    False        -0.004995    False   
other_race                      -0.018654    False        -0.001318    False   
latinx                          -0.007539    False        -0.001656    False   
age_18-25                       -0.119890    False         0.047145    False   
age_26-35                       -0.128821    False         0.041569    False   
age_36-45                       -0.136130    False         0.048550    False   
age_46-55                       -0.12247

### No White
Other than same variables above to clear perfect multicollinearity, these groups of models also excluded "white".

In [22]:
nw = ["white"]

In [23]:
T10_nw = return_plist(lrdata1, nw)

In [24]:
print(T10_nw)
T10_nw.to_csv("~/Desktop/T10_nw.csv")

                          fst_inlist_coef  p<=0.05  snd_inlist_coef  p<=0.05  \
const                            0.221622    False         0.001488     True   
black                            0.019049    False         0.008756    False   
native                           0.011106    False        -0.002179    False   
asian                           -0.076842    False         0.070577     True   
hawaiian                         0.037351    False        -0.008889    False   
other_race                      -0.009533    False         0.004498    False   
latinx                          -0.000119     True         0.003075    False   
age_18-25                       -0.124134    False         0.044439    False   
age_26-35                       -0.133038    False         0.038879    False   
age_36-45                       -0.140598    False         0.045701    False   
age_46-55                       -0.126877    False         0.047057    False   
age_56-65                       -0.09991

### Only White
Other than same variables above to clear perfect multicollinearity, these groups of models also excluded all race binary variables excpet "white".

In [25]:
ow = ["black", "native", "asian", "hawaiian", "other_race", "unsure_race", "latinx"]

In [26]:
T10_ow = return_plist(lrdata1, ow)

In [27]:
print(T10_ow)
T10_ow.to_csv("~/Desktop/T10_ow.csv")

                          fst_inlist_coef  p<=0.05  snd_inlist_coef  p<=0.05  \
const                            0.240266     True         0.010900    False   
white                           -0.020184    False        -0.014274     True   
age_18-25                       -0.129917    False         0.050680    False   
age_26-35                       -0.137537    False         0.044458    False   
age_36-45                       -0.144592    False         0.051404    False   
age_46-55                       -0.129119    False         0.052504    False   
age_56-65                       -0.101766    False         0.035279    False   
age_66+                         -0.103011    False         0.007507    False   
gender_Man                       0.016943    False        -0.008877    False   
gender_Prefer not to say         0.057590    False        -0.008315    False   
gender_Woman                     0.016582    False        -0.005395    False   

                          trd_inlist_co

### No Black
Other than same variables above to clear perfect multicollinearity, these groups of models also excluded "black".

In [28]:
nb = ["black"]

In [29]:
T10_nb = return_plist(lrdata1, nb)

In [30]:
print(T10_nb)
T10_nb.to_csv("~/Desktop/T10_nb.csv")

                          fst_inlist_coef  p<=0.05  snd_inlist_coef  p<=0.05  \
const                            0.242822     True         0.012533    False   
white                           -0.025946     True        -0.014566    False   
native                           0.007449    False        -0.004025    False   
asian                           -0.074593    False         0.071584     True   
hawaiian                         0.043615    False        -0.005647    False   
other_race                      -0.018844    False        -0.000526     True   
latinx                          -0.007702    False        -0.000976    False   
age_18-25                       -0.119851    False         0.046982    False   
age_26-35                       -0.128827    False         0.041595    False   
age_36-45                       -0.136138    False         0.048583    False   
age_46-55                       -0.122495    False         0.049934    False   
age_56-65                       -0.09519

### Only Black
Other than same variables above to clear perfect multicollinearity, these groups of models also excluded all race binary variables excpet "black".

In [31]:
ob = ["white", "native", "asian", "hawaiian", "other_race", "unsure_race", "latinx"]

In [32]:
T10_ob = return_plist(lrdata1, ob)

In [33]:
print(T10_ob)

                          fst_inlist_coef  p<=0.05  snd_inlist_coef  p<=0.05  \
const                            0.228346    False         0.003942     True   
black                            0.018779    False         0.008774    False   
age_18-25                       -0.133611    False         0.047840    False   
age_26-35                       -0.141230    False         0.041147    False   
age_36-45                       -0.148730    False         0.047740    False   
age_46-55                       -0.134221    False         0.048148    False   
age_56-65                       -0.107202    False         0.030738    False   
age_66+                         -0.108252    False         0.002809     True   
gender_Man                       0.017046    False        -0.008506    False   
gender_Prefer not to say         0.061104    False        -0.005548    False   
gender_Woman                     0.014617    False        -0.006825    False   

                          trd_inlist_co