# Initialization

In [38]:
import pandas as pd
import numpy as np

In [39]:
TRAIN_PATH = 'UtkMl_dataset/train.csv'
TEST_PATH = 'datasets/test.csv'

VOCAB_SIZE = 2500

## Read Data

INDEX, DOC_ID, WORD_ID, LABEL

In [40]:
sparse_train_data = np.loadtxt('trainning_data.txt', delimiter=' ', dtype=str)
sparse_test_data = np.loadtxt('test_data.txt', delimiter=' ', dtype=str)
grouped_data = np.loadtxt('grouped_data.txt', delimiter=' ', dtype=str)

In [41]:
data = pd.read_csv(TRAIN_PATH)
data

Unnamed: 0,Tweet,following,followers,actions,is_retweet,location,Type,Unnamed: 7
0,Good Morning Love @LeeBrown_V,0.0,0.0,0.0,0.0,"Pennsylvania, USA",Quality,
1,'@realDonaldTrump @USNavy RIP TO HEROES',42096.0,61060.0,5001.0,0.0,"South Padre Island, Texas",Spam,
2,Haven't been following the news but I understa...,0.0,0.0,,0.0,Will never be broke ever again,Quality,
3,pic.twitter.com/dy9q4ftLhZ What to do with pap...,0.0,0.0,0.0,0.0,Mundo,Quality,
4,#DidYouKnow ► Mahatma Gandhi made a brief visi...,17800.0,35100.0,,0.0,"Nottingham, England",Quality,
...,...,...,...,...,...,...,...,...
14894,"#AllWentWrongWhen I told my hair stylist to ""g...",695.0,533.0,868.0,1.0,United States,Spam,
14895,"They don't have to like you, and you don't hav...",0.0,0.0,0.0,0.0,,Quality,
14896,#Miami Graham Nash Live at Parker Playhouse #...,5647.0,15091.0,5823.0,0.0,United States,Spam,
14897,@bethannhamilton is in the business of one-upp...,0.0,0.0,,0.0,"Southgate, MI",Quality,


In [42]:
vocab = pd.read_csv('vocab.csv')
vocab

Unnamed: 0,WORD_ID,VOCAB_WORD
0,0,http
1,1,trump
2,2,news
3,3,get
4,4,new
...,...,...
2495,2495,moto
2496,2496,yell
2497,2497,butter
2498,2498,lack


# Creating a Decision Tree

## For practice get 4 rows from train data

In [43]:
four_samples = data.sample(4)

print(type(four_samples))
four_samples

<class 'pandas.core.frame.DataFrame'>


Unnamed: 0,Tweet,following,followers,actions,is_retweet,location,Type,Unnamed: 7
6557,Niall & the boyspic.twitter.com/iq0Lo0m2ux,60000.0,76700.0,,0.0,luke followed me for Christmas,Quality,
12024,Willie Nelson postpones two more shows because...,898.0,563.0,2160.0,1.0,United States,Spam,
8986,Disturbing questions linger in Joliet-area tod...,11685.0,21443.0,49491.0,0.0,United States,Spam,
6514,@haleyymichele HAPPY BIRTHDAY HALES #SpartyDow...,0.0,0.0,,0.0,"Iowa City, IA",Quality,


In [44]:
four_samples.iloc[0]

Tweet         Niall & the boyspic.twitter.com/iq0Lo0m2ux
following                                        60000.0
followers                                        76700.0
actions                                              NaN
is_retweet                                           0.0
location                  luke followed me for Christmas
Type                                             Quality
Unnamed: 7                                           NaN
Name: 6557, dtype: object

In [45]:
''' Columns: DOC_ID, WORD_ID, LABEL, OCCURENCE '''

# Get the words in the sample after NLP from first_doc_id
# TODO: need to find a spot for this

first_doc_id = four_samples.index[0]
print('first doc id:', first_doc_id)
for row in grouped_data:
    if row[0] == str(first_doc_id):
        print(row)

first doc id: 6557


### Functions for decisions
TODO: We want the number of spam words versus non spam words

In [46]:
# Function that checks against followers vs followees

# Returns true if following zero people

# Returns the ratio of followers to following
def _percent_followers_following(row):
    return row.followers / row.following
    
# big enough ratio to satisfy min_percent
def min_percent_followers(row, min_percent=.7):
    if row.following == 0: return True
    return _percent_followers_following(row) > min_percent

# Enough followers
def enough_followers(row, minimum_followers=10):
    return row.followers > minimum_followers

# is retweet
def is_retweet(row)-> bool:
    return True if row.is_retweet else False
    

In [47]:
is_retweet(four_samples.iloc[0])

False

In [48]:
print(min_percent_followers(four_samples.iloc[0]))

True


## Create a decision tree using the bootstrapped dataset, but only use a random subset of variables at each step.

In [49]:
''' 
Decision Tree -- 1

returns whether true (spam) or false (not spam)

Structure:

1. Enough Followers
2. Minimum Percent Followers

''' 

# Testing tree one
def tree1(row):
    
    # First check if enough followers
    if(not enough_followers(row)):
        return False
    
    if(not min_percent_followers(row)):
        return False
    
    return True
    
    # Then check if Ratio of followers to following is high enough

In [50]:
''' 
Decision Tree -- 2

returns whether true (spam) or false (not spam)

Structure:

1. Minimum Percent Followers
2. Enough Followers

''' 
def tree2(row):
     
    # First check if enough followers
    if(not enough_followers(row)):
        return False
    
    if(not min_percent_followers(row)):
        return False
    
    return True   

In [51]:
''' 
Decision Tree -- 3

returns whether true (spam) or false (not spam)

Structure:

     enough_followers
    /            \
is_retweet    is_retweet

''' 
def tree3(row):
     
    if min_percent_followers(row):
        return True if is_retweet(row) else False
    
    else:
        return True if is_retweet(row) else False


In [52]:
''' 
Decision Tree -- 4

returns whether true (spam) or false (not spam)

Structure:

           is_retweet
           /         \
 Enough_Followers  Enough_Followers
''' 

def tree4(row):
    # First check if enough followers
    if is_retweet(row):
        return True if enough_followers(row) else False
    else: # not enough follower
        return True if enough_followers(row) else False
        

In [67]:
''' 
Decision Tree -- 5

returns whether true (spam) or false (not spam)

Structure:

           is_retweet
           /         \
 Enough_Followers  minimum_percent_followers
''' 

def tree5(row):
    # First check if enough followers
    if is_retweet(row):
        return True if enough_followers(row) else False
    else: # not enough follower
        return True if min_percent_followers(row) else False
        

## Running our random forest

In [60]:
data[:2]

Unnamed: 0,Tweet,following,followers,actions,is_retweet,location,Type,Unnamed: 7
0,Good Morning Love @LeeBrown_V,0.0,0.0,0.0,0.0,"Pennsylvania, USA",Quality,
1,'@realDonaldTrump @USNavy RIP TO HEROES',42096.0,61060.0,5001.0,0.0,"South Padre Island, Texas",Spam,


In [73]:
# creating a list of our decision trees

trees = [tree1, tree2, tree3, tree4, tree5]

ham_count = 0
spam_count = 0
res = []

for i, sample in enumerate(range(data.shape[0])):
    res.append([data.iloc[i][0], 0, 0, "Uninitialized"]) # [tweetId, spam, ham, Final Classification]
    
    for tree in trees:
        if tree(data.iloc[sample]):
            res[-1][1] += 1  # spam++
        else:
            res[-1][2] += 1 # ham++
    
    # Final Classification
    if res[1] > res[2]:
       res[3] = 'Spam'
    else:
        res[3] = 'Ham'
    

IndexError: list index out of range

In [71]:
print(res[:1])

[['Good Morning Love  @LeeBrown_V', 2, 3]]
