In [1]:
import pandas as pd
import numpy as np


In [2]:
data=pd.read_csv("data.csv",index_col=None,names=["tweet","label"])

In [3]:
data.shape

(1066, 2)

In [4]:
data.columns

Index(['tweet', 'label'], dtype='object')

In [5]:
data["label"].value_counts()

Non-Bullying    638
Bullying        427
Text Label        1
Name: label, dtype: int64

In [6]:
data=data.drop(0,axis=0)


In [7]:
data["labeln"]=[0 if i=="Non-Bullying" else 1 for i in data.label]

In [8]:
data.reset_index(drop=True, inplace=True)

In [9]:
data

Unnamed: 0,tweet,label,labeln
0,.omg why are poc wearing fugly blue contacts s...,Non-Bullying,0
1,.Sorry but most of the runners popular right n...,Non-Bullying,0
2,".those jeans are hideous, and I?m afraid he?s ...",Non-Bullying,0
3,.I had to dress up for a presentation in class...,Non-Bullying,0
4,.Am I the only one who thinks justin bieber is...,Non-Bullying,0
...,...,...,...
1060,"No we are not, But you are a race baiting libt...",Bullying,1
1061,"you wont get anyone for this challenge., after...",Bullying,1
1062,"I will follow you if you are not a libtard,Mus...",Bullying,1
1063,"michaelianblack Ur a child, an ostrich w/ your...",Bullying,1


In [10]:
import nltk
import numpy as np
import pandas as pd
from nltk.corpus import twitter_samples 
import nltk
nltk.download('stopwords')
from sklearn.model_selection import train_test_split

[nltk_data] Downloading package stopwords to C:\Users\Almas
[nltk_data]     Ansari\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [11]:
import re
import string
import numpy as np

from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import TweetTokenizer


def process_tweet(tweet):
    """Process tweet function.
    Input:
        tweet: a string containing a tweet
    Output:
        tweets_clean: a list of words containing the processed tweet

    """
    stemmer = PorterStemmer()
    stopwords_english = stopwords.words('english')
    # remove stock market tickers like $GE
    tweet = re.sub(r'\$\w*', '', tweet)
    # remove old style retweet text "RT"
    tweet = re.sub(r'^RT[\s]+', '', tweet)
    # remove hyperlinks
    tweet = re.sub(r'https?:\/\/.*[\r\n]*', '', tweet)
    # remove hashtags
    # only removing the hash # sign from the word
    tweet = re.sub(r'#', '', tweet)
    # tokenize tweets
    tokenizer = TweetTokenizer(preserve_case=False, strip_handles=True,
                               reduce_len=True)
    tweet_tokens = tokenizer.tokenize(tweet)

    tweets_clean = []
    for word in tweet_tokens:
        if (word not in stopwords_english and  # remove stopwords
                word not in string.punctuation):  # remove punctuation
            # tweets_clean.append(word)
            stem_word = stemmer.stem(word)  # stemming word
            tweets_clean.append(stem_word)

    return tweets_clean


def build_freqs(tweets, ys):
    """Build frequencies.
    Input:
        tweets: a list of tweets
        ys: an m x 1 array with the sentiment label of each tweet
            (either 0 or 1)
    Output:
        freqs: a dictionary mapping each (word, sentiment) pair to its
        frequency
    """
    # Convert np array to list since zip needs an iterable.
    # The squeeze is necessary or the list ends up with one element.
    # Also note that this is just a NOP if ys is already a list.
    yslist = np.squeeze(ys).tolist()

    # Start with an empty dictionary and populate it by looping over all tweets
    # and over all processed words in each tweet.
    freqs = {}
    for y, tweet in zip(yslist, tweets):
        for word in process_tweet(tweet):
            pair = (word, y)
            if pair in freqs:
                freqs[pair] += 1
            else:
                freqs[pair] = 1

    return freqs


In [12]:
X=data.tweet
Y=data.labeln
X_train,X_test,y_train,y_test=train_test_split(X,Y)

In [13]:
freqs = build_freqs(X_train, y_train)


In [14]:
def extract_features(tweet, freqs):
    '''
    Input: 
        tweet: a list of words for one tweet
        freqs: a dictionary corresponding to the frequencies of each tuple (word, label)
    Output: 
        x: a feature vector of dimension (1,3)
    '''
    # process_tweet tokenizes, stems, and removes stopwords
    word_l = process_tweet(tweet)
    
    
    x = np.zeros((1, 2)) 
    
    
    
    ### START CODE HERE (REPLACE INSTANCES OF 'None' with your code) ###
    
    # loop through each word in the list of words
    for word in word_l:
        
        # increment the word count for the positive label 1
        x[0,0] += freqs.get((word, 1.0),0)
        
        # increment the word count for the negative label 0
        x[0,1] += freqs.get((word, 0.0),0)
        
    ### END CODE HERE ###
    assert(x.shape == (1, 2))
    return x[0]

In [15]:
extract_features(data.tweet[1000],freqs)

array([29., 37.])

In [16]:
from sklearn.linear_model import LogisticRegression


In [17]:
lr=LogisticRegression()

In [18]:
X_train.index

Int64Index([ 427,  701,   25,  724,  716, 1020,  739,  412, 1019,  359,
            ...
             906,  517,  730,  696,  939,  214,    7,  664,   18,  613],
           dtype='int64', length=798)

In [19]:
X_train_fin=[extract_features(X_train[i],freqs) for i in X_train.index]

In [20]:
np.array(X_train_fin)

array([[89., 97.],
       [96., 40.],
       [22., 39.],
       ...,
       [68., 73.],
       [22., 60.],
       [31., 30.]])

In [21]:
lr.fit(X_train_fin,y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [22]:
X_test_fin=[extract_features(X_test[i],freqs) for i in X_test.index]

In [23]:
y_pred=lr.predict(X_test_fin)

In [24]:
from sklearn.metrics import accuracy_score

In [25]:
accuracy_score(y_test,y_pred)

0.7265917602996255