# Uczenie maszynowe - Angry Tweets
## Bartosz Żabiński 129738

#### Import Bibliotek

In [2]:
import re
import pandas as pd
import nltk
import collections
from nltk.corpus import stopwords
import csv
import string
from sklearn.metrics import accuracy_score

#### Wczytanie zbioru danych

In [3]:
twt_train = pd.read_csv("train.csv", header=0 , sep=",", index_col=None, na_values="Not Available", dtype = str).dropna()
twt_train = twt_train.reset_index(drop=True)
twt_train = twt_train.iloc[:,2]
twt_train.head(10)

0    IOS 9 App Transport Security. Mm need to check...
1    Mar if you have an iOS device, you should down...
2    @jimmie_vanagon my phone does not run on lates...
3    Not sure how to start your publication on iOS?...
4    Two Dollar Tuesday is here with Forklift 2, Qu...
5    If you're not already signed up to test my iOS...
6    YouTube Gaming Officially Launches On Web, And...
7    YouTube Gaming Launches Tomorrow with iOS and ...
8    @astrill Yashan from BBC @bbcchinese the VPN a...
9    Parkrun app for iOS downloaded Where have you ...
Name: Tweet, dtype: object

#### Wczytanie zbioru danych

In [4]:
RE_SPACES = re.compile("\s+")
RE_HASHTAG = re.compile("[@#][_a-z0-9]+")
RE_EMOTICONS = re.compile("(:-?\))|(:p)|(:d+)|(:-?\()|(:/)|(;-?\))|(<3)|(=\))|(\)-?:)|(:'\()|(8\))")
RE_HTTP = re.compile("http(s)?://[/\.a-z0-9]+")
stopwords = nltk.corpus.stopwords.words('english')

class BeforeTokenizationNormalizer():
    @staticmethod
    def normalize(text):
        text = str(text).strip().lower()
        text = text.replace('&nbsp;', ' ')
        text = text.replace('&lt;', '<')
        text = text.replace('&gt;', '>')
        text = text.replace('&amp;', '&')
        text = text.replace('&pound;', u'£')
        text = text.replace('&euro;', u'€')
        text = text.replace('&copy;', u'©')
        text = text.replace('&reg;', u'®')
        text = text.replace('...', ' ')
        text = text.replace('``', ' ')
        text = text.replace('--', ' ')
        text = text.replace("''", ' ')
        return text

class Tokenizer():
    @staticmethod
    def tokenize(text):
        pass
    
class SimpleTokenizer(Tokenizer):
    @staticmethod
    def tokenize(text):
        return re.split(RE_SPACES,text)

#### Definicja tokenizatora wraz z usunięciem znaków interpunkcyjnych, najczęściej występujących słów języka oraz cyfr.

In [13]:
class TokenizeTwt(Tokenizer):
    @staticmethod
    def tokenize(text):
            
            tokens = SimpleTokenizer.tokenize(text)
            i = 0
            while i < len(tokens):
                token = tokens[i]
                match = re.search(RE_HASHTAG, token) or re.search(RE_EMOTICONS, token) or re.search(RE_HTTP, token)

                if match is not None:
                    del tokens[i]
                    i -= 1
                    match=None
                else:
                    del tokens[i]
                    tokens[i:i] = nltk.word_tokenize(token)
                i += 1
                
            tokens=  [tok.lower() for tok in tokens if tok not in string.punctuation ]
            tokens = [word for word in tokens if word not in stopwords]
            #tokens = [w for w in tokens if not re.search(r'\d', w)]
            stemmed_tokens =[]    
            for t in tokens:
                stemmed_tokens.append(nltk.stem.porter.PorterStemmer().stem(t))

            return stemmed_tokens
        

#### Tokenizacja

In [14]:
for i in twt_train.index:
    twt = BeforeTokenizationNormalizer.normalize(twt_train.iat[i])
    tknz = TokenizeTwt.tokenize(twt)
    print(tknz)

[u'io', u'9', u'app', u'transport', u'secur', u'mm', u'need', u'check', u'3rd', u'parti', u'network', u'pod', u'support']
[u'mar', u'io', u'devic', u'download', u'app']
[u'phone', u'run', u'latest', u'io', u'may', u'account', u'problem', u'day', u'..', u'time', u'replac']
[u'sure', u'start', u'public', u'io', u"'ll", u'live', u'help', u'ask', u'anyth', u'session', u'today', u'friday']
[u'two', u'dollar', u'tuesday', u'forklift', u'2', u'quickkey', u'io', u'suit', u'page', u'1.99', u'today']
[u"'re", u'alreadi', u'sign', u'test', u'io', u'game', u'now', u'chanc']
[u'youtub', u'game', u'offici', u'launch', u'web', u'android', u'io', u'august', u'26', u'youtub', u'final', u'go', u'r']
[u'youtub', u'game', u'launch', u'tomorrow', u'io', u'android', u'app', u'go', u'head-to-head', u'twitch']
[u'yashan', u'bbc', u'vpn', u'access', u'io', u'may', u'limit', u'china', u"'s", u'militari', u'parad', u'chanc', u'chat']
[u'parkrun', u'app', u'io', u'download', u'great', u'app', u'easier', u'access'

#### Zliczanie występujących wyrażeń ( popularnych słów )

In [15]:
words = collections.Counter()
for i in twt_train.index:
    twt = BeforeTokenizationNormalizer.normalize(twt_train.iat[i])
    words.update(TokenizeTwt.tokenize(twt)) 

len(words)

8820

#### Kod tworzący reprezentację bag-of-words

In [16]:
def create_bow(documents, features):
    row = []
    col = []
    data = []
    labels = []
    for i in documents.index:
        tweet = BeforeTokenizationNormalizer.normalize(documents.iat[i, 2])
        label = documents.iat[i, 1]
        tweet_tokens = TokenizeTwt.tokenize(tweet)

        labels.append(label)
        for token in set(tweet_tokens):
            if token not in features:
                continue
            row.append(i)
            col.append(features[token])
            data.append(1)
    return csr_matrix((data, (row, col)), shape=(len(documents), len(features))), labels

def create_bow_test(documents, features):
    row = []
    col = []
    data = []
    labels = []
    for i in range(len(documents)):
        tweet = BeforeTokenizationNormalizer.normalize(documents.iat[i, 1])

        tweet_tokens = TokenizeTwt.tokenize(tweet)
        
        
        for token in set(tweet_tokens):
            if token not in features:
                continue
            row.append(i)
            col.append(features[token])
            data.append(1)
    return csr_matrix((data, (row, col)), shape=(len(documents), len(features)))

#### Klasyfikacja

In [17]:
from scipy.sparse import csr_matrix
from sklearn.ensemble import RandomForestClassifier

min_word_count = 3

train_tweets = pd.read_csv("train.csv", sep=",", header=0,dtype = str)
test_tweets = pd.read_csv("test.csv", sep=",", header=0,dtype = str).dropna()


common_words = list([k for k, v in words.most_common() if v > min_word_count])


feature_dict = {}
for word in common_words:
    feature_dict[word] = len(feature_dict)

print("Klasyfikacja")
X_train, y_train = create_bow(train_tweets, feature_dict)
list_of_labels = list(set(y_train))
classifier = RandomForestClassifier(n_estimators=500, n_jobs=-1, random_state=129738)
classifier.fit(X_train, y_train)

print("Testowanie")
X_test = create_bow_test(test_tweets, feature_dict)
predicted = classifier.predict(X_test)

df2 = pd.DataFrame({ 'Id' : test_tweets["Id"],
                      'Category' : predicted,      
                    })

sequence = ['Id','Category']
df2 = df2.reindex(columns=sequence)

print(df2.head(5))
df2.to_csv(path_or_buf="wynik.csv", sep=',', header=True, index = False)
print(len(predicted))

accuracy_score(train_tweets.iloc[0:4000,1], predicted)

Klasyfikacja
Testowanie
                   Id  Category
0  628949369883000832  positive
1  628976607420645377   neutral
2  629023169169518592  positive
3  629179223232479232  positive
4  629186282179153920   neutral
4000


0.442