In [1]:
import pandas as pd
import numpy as np
import re
import unicodedata
from tqdm import tqdm
from langdetect import detect

import nltk
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
from nltk.tokenize import TweetTokenizer
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

from sklearn.feature_extraction import text
from sklearn.pipeline import FeatureUnion,Pipeline
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import LabelEncoder

from sklearn.model_selection import GridSearchCV
from sklearn.svm import LinearSVC
from sklearn.metrics import classification_report, accuracy_score

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\mxyar\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\mxyar\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\mxyar\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


## Сlass, that converts the original tweets

In [2]:
class FreeTextConverter:
    def __init__(self, regex_ignore=[], emoticons_ignore=False):
        self.regex_dict = {
            'URL': r"""(?xi)\b(?:(?:https?|ftp|file):\/\/|www\.|ftp\.|pic\.|twitter\.|facebook\.)(?:\([-A-Z0-9+&@#\/%=~_|$?!:;,.]*\)|[-A-Z0-9+&@#\/%=~_|$?!:;,.])*(?:\([-A-Z0-9+&@#\/%=~_|$?!:,.]*\)|[A-Z0-9+&@#\/%=~_|$])""",
            'EMOJI': u'([\U0001F1E0-\U0001F1FF])|([\U0001F300-\U0001F5FF])|([\U0001F600-\U0001F64F])|([\U0001F680-\U0001F6FF])|([\U0001F700-\U0001F77F])|([\U0001F800-\U0001F8FF])|([\U0001F900-\U0001F9FF])|([\U0001FA00-\U0001FA6F])|([\U0001FA70-\U0001FAFF])|([\U00002702-\U000027B0])|([\U00002600-\U000027BF])|([\U0001f300-\U0001f64F])|([\U0001f680-\U0001f6FF])',
            'HASHTAG': r"\#\b[\w\-\_]+\b",
            'EMAIL': r"(?:^|(?<=[^\w@.)]))(?:[\w+-](?:\.(?!\.))?)*?[\w+-]@(?:\w-?)*?\w+(?:\.(?:[a-z]{2,})){1,3}(?:$|(?=\b))",
            'MENTION': r"@[A-Za-z0-9]+",
            'CASHTAG': r"(?:[$\u20ac\u00a3\u00a2]\d+(?:[\\.,']\d+)?(?:[MmKkBb](?:n|(?:il(?:lion)?))?)?)|(?:\d+(?:[\\.,']\\d+)?[$\u20ac\u00a3\u00a2])",
            'DATE': r"(?:(?:(?:(?:(?<!:)\b\'?\d{1,4},? ?)?\b(?:[Jj]an(?:uary)?|[Ff]eb(?:ruary)?|[Mm]ar(?:ch)?|[Aa]pr(?:il)?|May|[Jj]un(?:e)?|[Jj]ul(?:y)?|[Aa]ug(?:ust)?|[Ss]ept?(?:ember)?|[Oo]ct(?:ober)?|[Nn]ov(?:ember)?|[Dd]ec(?:ember)?)\b(?:(?:,? ?\'?)?\d{1,4}(?:st|nd|rd|n?th)?\b(?:[,\\/]? ?\'?\d{2,4}[a-zA-Z]*)?(?: ?- ?\d{2,4}[a-zA-Z]*)?(?!:\d{1,4})\b))|(?:(?:(?<!:)\b\\'?\d{1,4},? ?)\b(?:[Jj]an(?:uary)?|[Ff]eb(?:ruary)?|[Mm]ar(?:ch)?|[Aa]pr(?:il)?|May|[Jj]un(?:e)?|[Jj]ul(?:y)?|[Aa]ug(?:ust)?|[Ss]ept?(?:ember)?|[Oo]ct(?:ober)?|[Nn]ov(?:ember)?|[Dd]ec(?:ember)?)\b(?:(?:,? ?\'?)?\d{1,4}(?:st|nd|rd|n?th)?\b(?:[,\\/]? ?\'?\d{2,4}[a-zA-Z]*)?(?: ?- ?\d{2,4}[a-zA-Z]*)?(?!:\d{1,4})\b)?))|(?:\b(?<!\d\\.)(?:(?:(?:[0123]?[0-9][\\.\\-\\/])?[0123]?[0-9][\\.\\-\\/][12][0-9]{3})|(?:[0123]?[0-9][\\.\\-\\/][0123]?[0-9][\\.\\-\\/][12]?[0-9]{2,3}))(?!\.\d)\b))",
            'TIME': r'(?:(?:\d+)?\\.?\d+(?:AM|PM|am|pm|a\\.m\\.|p\\.m\\.))|(?:(?:[0-2]?[0-9]|[2][0-3]):(?:[0-5][0-9])(?::(?:[0-5][0-9]))?(?: ?(?:AM|PM|am|pm|a\\.m\\.|p\\.m\\.))?)',
            'EMPHASIS': r"(?:\*\b\w+\b\*)",
            'ELONG': r"\b[A-Za-z]*([a-zA-Z])\1\1[A-Za-z]*\b"}
        
        for key in regex_ignore:
            if key in self.regex_dict:
                del self.regex_dict[key]

        self.contraction_mapping = {"’": "'", "RT ": " ", "ain't": "is not", "aren't": "are not", "can't": "can not",
                                    "'cause": "because", "could've": "could have",
                                    "couldn't": "could not", "didn't": "did not", "doesn't": "does not",
                                    "don't": "do not", "hadn't": "had not",
                                    "hasn't": "has not", "haven't": "have not", "he'd": "he would", "he'll": "he will",
                                    "he's": "he is",
                                    "how'd": "how did", "how'd'y": "how do you", "how'll": "how will",
                                    "how's": "how is", "I'd": "I would",
                                    "I'd've": "I would have", "I'll": "I will", "I'll've": "I will have", "I'm": "I am",
                                    "I've": "I have",
                                    "i'd": "i would", "i'd've": "i would have", "i'll": "i will",
                                    "i'll've": "i will have", "i'm": "i am",
                                    "i've": "i have", "isn't": "is not", "it'd": "it would", "it'd've": "it would have",
                                    "it'll": "it will",
                                    "it'll've": "it will have", "it's": "it is", "it’s": "it is", "let's": "let us",
                                    "ma'am": "madam", "mayn't": "may not",
                                    "might've": "might have", "mightn't": "might not", "mightn't've": "might not have",
                                    "must've": "must have",
                                    "mustn't": "must not", "mustn't've": "must not have", "needn't": "need not",
                                    "needn't've": "need not have",
                                    "o'clock": "of the clock", "oughtn't": "ought not", "oughtn't've": "ought not have",
                                    "shan't": "shall not",
                                    "sha'n't": "shall not", "shan't've": "shall not have", "she'd": "she would",
                                    "she'd've": "she would have",
                                    "she'll": "she will", "she'll've": "she will have", "she's": "she is",
                                    "should've": "should have",
                                    "shouldn't": "should not", "shouldn't've": "should not have", "so've": "so have",
                                    "so's": "so as",
                                    "this's": "this is", "that'd": "that would", "that'd've": "that would have",
                                    "that's": "that is",
                                    "there'd": "there would", "there'd've": "there would have", "there's": "there is",
                                    "here's": "here is",
                                    "they'd": "they would", "they'd've": "they would have", "they'll": "they will",
                                    "they'll've": "they will have",
                                    "they're": "they are", "they've": "they have", "to've": "to have",
                                    "wasn't": "was not", "we'd": "we would",
                                    "we'd've": "we would have", "we'll": "we will", "we'll've": "we will have",
                                    "we're": "we are", "we've": "we have",
                                    "weren't": "were not", "what'll": "what will", "what'll've": "what will have",
                                    "what're": "what are",
                                    "what's": "what is", "what've": "what have", "when's": "when is",
                                    "when've": "when have", "where'd": "where did",
                                    "where's": "where is", "where've": "where have", "who'll": "who will",
                                    "who'll've": "who will have",
                                    "who's": "who is", "who've": "who have", "why's": "why is", "why've": "why have",
                                    "will've": "will have",
                                    "won't": "will not", "won't've": "will not have", "would've": "would have",
                                    "wouldn't": "would not",
                                    "wouldn't've": "would not have", "y'all": "you all", "y'all'd": "you all would",
                                    "y'all'd've": "you all would have",
                                    "y'all're": "you all are", "y'all've": "you all have", "you'd": "you would",
                                    "you'd've": "you would have",
                                    "you'll": "you will", "you'll've": "you will have", "you're": "you are",
                                    "you've": "you have", "It's": "It is", "You'd": "You would",
                                    ' u ': " you ", 'yrs': 'years', 'FYI': 'For your information', ' im ': ' I am ',
                                    'lol': 'LOL', 'You\'re': 'You are'
            , 'can’t': 'can not', '…': '. ', '...': '. ', '\'\'': '\'', '≠': '', 'ain’t': 'am not', 'I’m': 'I am',
                                    'RT\'s': ''}
        self.emoticons = {
            ':*': '<kiss>',
            ':-*': '<kiss>',
            ':x': '<kiss>',
            ':-)': '<happy>',
            ':-))': '<happy>',
            ':-)))': '<happy>',
            ':-))))': '<happy>',
            ':-)))))': '<happy>',
            ':-))))))': '<happy>',
            ':)': '<happy>',
            ':))': '<happy>',
            ':)))': '<happy>',
            ':))))': '<happy>',
            ':)))))': '<happy>',
            ':))))))': '<happy>',
            ':)))))))': '<happy>',
            ':o)': '<happy>',
            ':]': '<happy>',
            ':3': '<happy>',
            ':c)': '<happy>',
            ':>': '<happy>',
            '=]': '<happy>',
            '8)': '<happy>',
            '=)': '<happy>',
            ':}': '<happy>',
            ':^)': '<happy>',
            '|;-)': '<happy>',
            ":'-)": '<happy>',
            ":')": '<happy>',
            '\o/': '<happy>',
            '*\\0/*': '<happy>',
            ':-D': '<laugh>',
            ':D': '<laugh>',
            '8-D': '<laugh>',
            '8D': '<laugh>',
            'x-D': '<laugh>',
            'xD': '<laugh>',
            'X-D': '<laugh>',
            'XD': '<laugh>',
            '=-D': '<laugh>',
            '=D': '<laugh>',
            '=-3': '<laugh>',
            '=3': '<laugh>',
            'B^D': '<laugh>',
            '>:[': '<sad>',
            ':-(': '<sad>',
            ':-((': '<sad>',
            ':-(((': '<sad>',
            ':-((((': '<sad>',
            ':-(((((': '<sad>',
            ':-((((((': '<sad>',
            ':-(((((((': '<sad>',
            ':(': '<sad>',
            ':((': '<sad>',
            ':(((': '<sad>',
            ':((((': '<sad>',
            ':(((((': '<sad>',
            ':((((((': '<sad>',
            ':(((((((': '<sad>',
            ':((((((((': '<sad>',
            ':-c': '<sad>',
            ':c': '<sad>',
            ':-<': '<sad>',
            ':<': '<sad>',
            ':-[': '<sad>',
            ':[': '<sad>',
            ':{': '<sad>',
            ':-||': '<sad>',
            ':@': '<sad>',
            ":'-(": '<sad>',
            ":'(": '<sad>',
            'D:<': '<sad>',
            'D:': '<sad>',
            'D8': '<sad>',
            'D;': '<sad>',
            'D=': '<sad>',
            'DX': '<sad>',
            'v.v': '<sad>',
            "D-':": '<sad>',
            '(>_<)': '<sad>',
            ':|': '<sad>',
            '>:O': '<surprise>',
            ':-O': '<surprise>',
            ':-o': '<surprise>',
            ':O': '<surprise>',
            '°o°': '<surprise>',
            'o_O': '<surprise>',
            'o_0': '<surprise>',
            'o.O': '<surprise>',
            'o-o': '<surprise>',
            '8-0': '<surprise>',
            '|-O': '<surprise>',
            ';-)': '<wink>',
            ';)': '<wink>',
            '*-)': '<wink>',
            '*)': '<wink>',
            ';-]': '<wink>',
            ';]': '<wink>',
            ';D': '<wink>',
            ';^)': '<wink>',
            ':-,': '<wink>',
            '>:P': '<tong>',
            ':-P': '<tong>',
            ':P': '<tong>',
            'X-P': '<tong>',
            'x-p': '<tong>',
            ':-p': '<tong>',
            ':p': '<tong>',
            '=p': '<tong>',
            ':-Þ': '<tong>',
            ':Þ': '<tong>',
            ':-b': '<tong>',
            ':b': '<tong>',
            ':-&': '<tong>',
            '>:\\': '<annoyed>',
            '>:/': '<annoyed>',
            ':-/': '<annoyed>',
            ':-.': '<annoyed>',
            ':/': '<annoyed>',
            ':\\': '<annoyed>',
            '=/': '<annoyed>',
            '=\\': '<annoyed>',
            ':L': '<annoyed>',
            '=L': '<annoyed>',
            ':S': '<annoyed>',
            '>.<': '<annoyed>',
            ':-|': '<annoyed>',
            '<:-|': '<annoyed>',
            ':-X': '<seallips>',
            ':X': '<seallips>',
            ':-#': '<seallips>',
            ':#': '<seallips>',
            'O:-)': '<angel>',
            '0:-3': '<angel>',
            '0:3': '<angel>',
            '0:-)': '<angel>',
            '0:)': '<angel>',
            '0;^)': '<angel>',
            '>:)': '<devil>',
            '>:D': '<devil>',
            '>:-D': '<devil>',
            '>;)': '<devil>',
            '>:-)': '<devil>',
            '}:-)': '<devil>',
            '}:)': '<devil>',
            '3:-)': '<devil>',
            '3:)': '<devil>',
            'o/\o': '<highfive>',
            '^5': '<highfive>',
            '>_>^': '<highfive>',
            '^<_<': '<highfive>',
            '<3': '<heart>',
            '^3^': '<smile>',
            "(':": '<smile>',
            " > < ": '<smile>',
            "UvU": '<smile>',
            "uwu": '<smile>',
            'UwU': '<smile>'
        }
        
        if emoticons_ignore:
            for emotion_key in self.emoticons:
                self.emoticons[emotion_key] = '<emoticon>'
        
        self.regex = self.get_compiled()

    def get_compiled(self):
        regexes = {k: re.compile(self.regex_dict[k]) for k, v in
                   self.regex_dict.items()}
        return regexes

    def fit(self, Example):
        for key, reg in self.regex.items():
            Example = self.regex[key].sub(lambda m: " <" + key + "> ",
                                     Example)
        for word in self.emoticons.keys():
            Example = Example.replace(word, self.emoticons[word])
        Example = Example.lower()
        for word in self.contraction_mapping.keys():
            Example = Example.replace(word, self.contraction_mapping[word])
        Example = re.sub(r"[\-\"`@#$%^&*(|)/~\[\]{\}:;+,._='!?]+", " ", Example)
        Example = unicodedata.normalize('NFKD', Example).encode('ascii', errors='ignore').decode('utf8',
                                                                                                 errors='ignore')
        Example = re.sub(r'\b([b-hB-Hj-zJ-Z] )', ' ', Example)
        Example = re.sub(r'( [b-hB-Hj-zJ-Z])\b', ' ', Example)
        Example = ' '.join(Example.split())

        return Example

In [3]:
word_vectorizer = text.TfidfVectorizer(
          analyzer='word', ngram_range=(1, 3),
          min_df=2, use_idf=True, sublinear_tf=True)
char_vectorizer = text.TfidfVectorizer(
          analyzer='char', ngram_range=(3, 5),
          min_df=2, use_idf=True, sublinear_tf=True)
ngrams_vectorizer = Pipeline([('feats', FeatureUnion([('word_ngram', word_vectorizer),
                ('char_ngram', char_vectorizer),
                ])),])

In [4]:
free_text_converter = FreeTextConverter()

In [5]:
train_df = pd.read_csv("data/Train.csv")
test_df = pd.read_csv("data/Test.csv")

In [6]:
train_df.head()

Unnamed: 0,Topic,Sentiment,TweetDate,TweetText
0,apple,positive,Tue Oct 18 21:53:25 +0000 2011,Now all @Apple has to do is get swype on the i...
1,apple,positive,Tue Oct 18 21:09:33 +0000 2011,@Apple will be adding more carrier support to ...
2,apple,positive,Tue Oct 18 21:02:20 +0000 2011,Hilarious @youtube video - guy does a duet wit...
3,apple,positive,Tue Oct 18 20:40:10 +0000 2011,@RIM you made it too easy for me to switch to ...
4,apple,positive,Tue Oct 18 20:34:00 +0000 2011,I just realized that the reason I got into twi...


## Сlassify it into 4 classes (+ irrelevant)

In [7]:
class TweetClassifier:
    DETECT_LANGUAGES = {
        'en': 'english',
        'ru': 'russian',
        'az': 'azerbaijani',
        'da': 'danish',
        'de': 'german',
        'el': 'greek',
        'fr': 'french',
        'fi': 'finnish',
        'it': 'italian',
        'ro': 'romanian',
        'sl': 'slovene',
        'es': 'spanish',
        'sv': 'swedish',
        'tr': 'turkish',
        'hu': 'hungarian'
    }
    
    def __init__(self):
        self.clf = LinearSVC()
        self.free_text_converter = FreeTextConverter()
        self.tweet_vectorizer = TweetTokenizer()
        self.lexicon_dict = self._fill_lexicon()
        
    def _fill_lexicon(self):
        lexicon_dict = {}
        with open("assets/lexicon.txt", "r", encoding="utf-8") as file:
            for line in file:
                key, value = line.split("\t")
                lexicon_dict[key] = value

        return lexicon_dict
    
    def _preprocess_tweet(self, tweet, ignore_short_tweets=False):
        converted_tweet = self.free_text_converter.fit(tweet)
        tweet_tokens = self.tweet_vectorizer.tokenize(converted_tweet)
        
#         try:
#             lang = detect(tweet)
#             lang = self.DETECT_LANGUAGES.get(lang, 'english')
#         except:
#             lang = 'english'
        
#         stop_words = set(stopwords.words(lang))
#         tweet_tokens = [word for word in tweet_tokens if word not in stop_words]

        tweet_tokens = [self.lexicon_dict[word] if word in self.lexicon_dict else word for word in tweet_tokens]

        if ignore_short_tweets and len(tweet_tokens) < 3:
            return 'None'

        return " ".join(tweet_tokens)
    
    def _preprocess_train_tweet(self, tweet):
        return self._preprocess_tweet(tweet, ignore_short_tweets=True)
    
    def _preprocess_test_tweet(self, tweet):
        return self._preprocess_tweet(tweet)
        
    def fit(self, train_dataset_path, test_dataset_path=None):
        # read datasets
        train_df = pd.read_csv(train_dataset_path)
        
        # filter irrelevant tweets
        train_df = train_df[train_df['Sentiment'] != 'irrelevant']
        
        # convert tweets
        train_df['ConvertedTweet'] = train_df['TweetText'].apply(self._preprocess_train_tweet)
        train_df['ConvertedTweet'] = train_df[train_df['ConvertedTweet'] != None]
        
        self.encoder=LabelEncoder()
        self.encoder.fit(train_df['Sentiment'])
        y_train = self.encoder.transform(train_df['Sentiment'])
        
        # vectorize
        if test_dataset_path:
            test_df = pd.read_csv(test_dataset_path)
            test_df = test_df[test_df['Sentiment'] != 'irrelevant']
            test_df['ConvertedTweet'] = test_df['TweetText'].apply(self.free_text_converter.fit)
            self.vectorizer = ngrams_vectorizer.fit(pd.concat([train_df['ConvertedTweet'], test_df['ConvertedTweet']]))
        else:
            self.vectorizer = ngrams_vectorizer.fit(train_df['ConvertedTweet'])
        x_train = self.vectorizer.transform(train_df['ConvertedTweet'])
        
        self.clf.fit(x_train, y_train)
        
    def predict(self, test_dataset_path, is_print_results=False):
        test_df = pd.read_csv(test_dataset_path)
        test_df = test_df[test_df['Sentiment'] != 'irrelevant']
        test_df['ConvertedTweet'] = test_df['TweetText'].apply(self.free_text_converter.fit)
        x_test = self.vectorizer.transform(test_df['ConvertedTweet'])
        
        y_pred = clf.predict(x_test)
        
        if is_print_results:
            y_test = self.encoder.transform(test_df['Sentiment'])
            print(classification_report(y_test, y_pred))
        
        return self.encoder.inverse_transform(y_pred)  
    
    def predict_tweets(self, tweets):
        test_df = pd.DataFrame(tweets, columns=['TweetText'])
        test_df['ConvertedTweet'] = test_df['TweetText'].apply(self.free_text_converter.fit)
        x_test = self.vectorizer.transform(test_df['ConvertedTweet'])
        y_pred = clf.predict(x_test)
        return self.encoder.inverse_transform(y_pred)
    
    def _run(self, train_df, test_df, is_print_results=False):
        train_df['ConvertedTweet'] = train_df['TweetText'].apply(self._preprocess_train_tweet)
        train_df = train_df[train_df['ConvertedTweet'] != 'None']
        test_df['ConvertedTweet'] = test_df['TweetText'].apply(self._preprocess_test_tweet)
        
        self.encoder=LabelEncoder()
        self.encoder.fit(train_df['Sentiment'])
        y_train = self.encoder.transform(train_df['Sentiment'])

        self.vectorizer = ngrams_vectorizer.fit(train_df['ConvertedTweet'].values)
        x_train = self.vectorizer.transform(train_df['ConvertedTweet'].values)
        x_test = self.vectorizer.transform(test_df['ConvertedTweet'].values)
        
        self.clf.fit(x_train, y_train)
        y_pred = self.clf.predict(x_test)
        
        if is_print_results:
            y_test = self.encoder.transform(test_df['Sentiment'])
            print(classification_report(y_test, y_pred))
        
        return self.encoder.inverse_transform(y_pred)
    
    def run(self, train_path="data/Train.csv", test_path="data/Test.csv", is_print_results=False):
        train_df = pd.read_csv(train_path)
        test_df = pd.read_csv(test_path)
        train_df = train_df[train_df['Sentiment'] != 'irrelevant']
        test_df = test_df[test_df['Sentiment'] != 'irrelevant']

        return self._run(train_df, test_df, is_print_results)
    
    def run_with_tweets(self, tweets=[]):
        train_df = pd.read_csv("data/Train.csv")
        train_df = train_df[train_df['Sentiment'] != 'irrelevant']

        test_df = pd.DataFrame(tweets, columns=['TweetText'])
        return self._run(train_df, test_df)

In [8]:
tweet_clf = TweetClassifier()
y_pred = tweet_clf.run(train_path="data/Train.csv", test_path="data/Test.csv", is_print_results=True)

              precision    recall  f1-score   support

           0       0.71      0.51      0.60        49
           1       0.81      0.91      0.86       156
           2       0.67      0.56      0.61        32

    accuracy                           0.78       237
   macro avg       0.73      0.66      0.69       237
weighted avg       0.77      0.78      0.77       237



## Classify your own tweets
##### Use pandas dataframe or tweet list

In [9]:
test_path = "data/Test.csv"
y_pred = tweet_clf.run(test_path=test_path, is_print_results=True)

              precision    recall  f1-score   support

           0       0.71      0.51      0.60        49
           1       0.81      0.91      0.86       156
           2       0.67      0.56      0.61        32

    accuracy                           0.78       237
   macro avg       0.73      0.66      0.69       237
weighted avg       0.77      0.78      0.77       237



In [10]:
tweets = [
    'Hey! Today I\'m gonna sign a contract with Apple!',
    'My mom made me Breakfast. I\'m so happy ^^',
    'SALE. -20% https://sales.com',
    '@pepe I hate you!!!!'
]

y_pred = tweet_clf.run_with_tweets(tweets)
print(y_pred)

['neutral' 'positive' 'neutral' 'negative']


# Let's predict organization by a Tweet

In [11]:
# Create all subsets given set
from itertools import combinations
def sub_lists(my_list):
    subs = []
    for i in range(0, len(my_list) + 1):
        temp = [list(x) for x in combinations(my_list, i)]
        if len(temp) > 0:
            subs.extend(temp)
    return subs

In [12]:
def predict_org(regex_ignore, print_result=False):
    # read datasets
    train_df = pd.read_csv("data/Train.csv")
    test_df = pd.read_csv("data/Test.csv")

    free_text_converter = FreeTextConverter(regex_ignore=regex_ignore)

    train_df['ConvertedTweet']=train_df['TweetText'].apply(free_text_converter.fit)
    test_df['ConvertedTweet']=test_df['TweetText'].apply(free_text_converter.fit)

    vectorizer = ngrams_vectorizer.fit(train_df['ConvertedTweet'].values)
    x_train = vectorizer.transform(train_df['ConvertedTweet'].values)
    x_test = vectorizer.transform(test_df['ConvertedTweet'].values)

    encoder=LabelEncoder()
    y_train = encoder.fit_transform(train_df['Topic'])
    y_test = encoder.fit_transform(test_df['Topic'])

    clf = LinearSVC()
    clf.fit(x_train, y_train)
    y_pred = clf.predict(x_test)

    if print_result:
        print(classification_report(y_test, y_pred))
    return y_test, y_pred

Hypothesis: No need to convert hashtags and mentions, when we want to predict organization

In [None]:
pattern_list = list(FreeTextConverter().regex_dict.keys())
scores = {}
for regex_ignore in tqdm(sub_lists(pattern_list)):
    y_test, y_pred = predict_org(regex_ignore=regex_ignore)
    score = accuracy_score(y_test, y_pred)
    scores[','.join(regex_ignore)] = score

best_patterns = ''
best_score = -1
for key, score in scores.items():
    if score > best_score:
        best_score = score
        best_patterns = key
print('Best pattern subset and best accurancy score:')
print(best_patterns, best_score)

In [14]:
y_test, y_pred = predict_org(regex_ignore=['HASHTAG','MENTION'], print_result=True)

              precision    recall  f1-score   support

           0       0.93      0.96      0.94        98
           1       0.85      0.77      0.81        79
           2       0.85      0.74      0.79        78
           3       0.74      0.86      0.80        87

    accuracy                           0.84       342
   macro avg       0.84      0.83      0.84       342
weighted avg       0.85      0.84      0.84       342

