In [165]:
from sklearn.feature_extraction.text import CountVectorizer
from emot.emo_unicode import EMOTICONS_EMO, UNICODE_EMOJI
from sklearn.model_selection import train_test_split
from nltk.stem import WordNetLemmatizer
from spellchecker import SpellChecker
from sklearn.decomposition import PCA
import matplotlib.ticker as ticker
from nltk.corpus import stopwords
from collections import Counter
import matplotlib.pyplot as plt
import seaborn as sb
import pandas as pd
import numpy as np
import sklearn
import string
import nltk
import re

In [166]:
# Load the full tweet dump from disk and preview the first rows.
DATA_PATH = "../data/data_full.csv"
text_df = pd.read_csv(DATA_PATH)
text_df.head()

Unnamed: 0,tweet_id,author_id,inbound,created_at,text,response_tweet_id,in_response_to_tweet_id
0,1,sprintcare,False,Tue Oct 31 22:10:47 +0000 2017,@115712 I understand. I would like to assist y...,2.0,3.0
1,2,115712,True,Tue Oct 31 22:11:45 +0000 2017,@sprintcare and how do you propose we do that,,1.0
2,3,115712,True,Tue Oct 31 22:08:27 +0000 2017,@sprintcare I have sent several private messag...,1.0,4.0
3,4,sprintcare,False,Tue Oct 31 21:54:49 +0000 2017,@115712 Please send us a Private Message so th...,3.0,5.0
4,5,115712,True,Tue Oct 31 21:49:35 +0000 2017,@sprintcare I did.,4.0,6.0


In [167]:
# Work on a reproducible 1% sample while prototyping.
# TODO: remove the sampling step when running on the full dataset.
text_df = text_df.sample(frac=0.01, random_state=42)

# Keep only the tweet body. Missing texts have to be dropped *before* the cast:
# astype(str) turns NaN into the literal string "nan", which a later dropna()
# can no longer detect. .copy() detaches the frame from the original so the
# column assignment below does not write into a view.
text_df = text_df[["text"]].dropna(subset=["text"]).copy()
text_df["text"] = text_df["text"].astype(str)

In [168]:
# Discard rows that contain missing values, then verify that none are left.
text_df = text_df.dropna()
text_df.isna().sum()

text    0
dtype: int64

In [169]:
text_df.head(10)

Unnamed: 0,text
1240028,@AppleSupport Don’t even want to talk about it...
2245905,@617197 No problem :) ^HP
1420724,@482256 You will need to contact the Post Offi...
1540406,Another wonderful flight @VirginAmerica. I’m r...
478106,"@245618 Hi there, can you please direct messag..."
941529,@366978 @nationalrailenq I can appreciate that...
1349981,@Uber_Support set example of worst services
1474279,@496485 We appreciate your feedback. I’ll pass...
2314056,@GWRHelp OK thanks for your help.
976960,"@375648 @NikeSupport same, did you ever fix it?"


In [170]:
text_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 28118 entries, 1240028 to 36121
Data columns (total 1 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    28118 non-null  object
dtypes: object(1)
memory usage: 439.3+ KB


In [171]:
# Normalise case first, then trim surrounding whitespace.
lowered_text = text_df["text"].str.lower()
text_df["text"] = lowered_text.str.strip()
text_df.head()

Unnamed: 0,text
1240028,@applesupport don’t even want to talk about it...
2245905,@617197 no problem :) ^hp
1420724,@482256 you will need to contact the post offi...
1540406,another wonderful flight @virginamerica. i’m r...
478106,"@245618 hi there, can you please direct messag..."


In [172]:
# A single @mention, plus the space that follows it, at the very start of a tweet.
AUTHOR_PATTERN = re.compile(r"^@\S+ ")


def remove_author(text):
    """Return ``text`` without its leading ``@handle `` prefix.

    Only the first mention at the start of the string is removed; mentions
    further into the text (including a second leading one) are left as they are.
    """
    return AUTHOR_PATTERN.sub("", text)

# Strip the leading mention from every tweet and preview the result.
text_df["text"] = text_df["text"].map(remove_author)
text_df.head()

Unnamed: 0,text
1240028,don’t even want to talk about it... you know t...
2245905,no problem :) ^hp
1420724,you will need to contact the post office (usps...
1540406,another wonderful flight @virginamerica. i’m r...
478106,"hi there, can you please direct message your f..."


In [173]:
# http(s):// links and bare www. links, each running up to the next whitespace.
# The pattern must not require a trailing space, otherwise a "www." link that
# ends the tweet is never matched.
HTTP_PATTERN = re.compile(r"https?://\S+|www\.\S+")


def remove_http(text):
    """Return ``text`` with every URL removed.

    Only the URL itself is deleted; surrounding whitespace is left in place
    (the caller strips the ends, and later steps re-tokenise on whitespace).
    """
    return HTTP_PATTERN.sub("", text)

# Remove links from every tweet, trim the ends again, and preview the result.
text_df["text"] = text_df["text"].map(remove_http).str.strip()
text_df.head()

Unnamed: 0,text
1240028,don’t even want to talk about it... you know t...
2245905,no problem :) ^hp
1420724,you will need to contact the post office (usps...
1540406,another wonderful flight @virginamerica. i’m r...
478106,"hi there, can you please direct message your f..."


In [174]:
# Anything between "<" and the nearest following ">" on the same line.
HTML_PATTERN = re.compile('<.*?>')


def remove_html(text):
    """Return ``text`` with every ``<...>`` span deleted."""
    return HTML_PATTERN.sub("", text)

# Drop tag-like spans from every tweet and preview the result.
text_df["text"] = text_df["text"].map(remove_html)
text_df.head()

Unnamed: 0,text
1240028,don’t even want to talk about it... you know t...
2245905,no problem :) ^hp
1420724,you will need to contact the post office (usps...
1540406,another wonderful flight @virginamerica. i’m r...
478106,"hi there, can you please direct message your f..."


In [175]:
# Flatten multi-line tweets: every newline becomes a single space
# (plain-string replacement, no regular expression involved).
text_df["text"] = text_df["text"].str.replace("\n", " ", regex=False)
text_df.head()

Unnamed: 0,text
1240028,don’t even want to talk about it... you know t...
2245905,no problem :) ^hp
1420724,you will need to contact the post office (usps...
1540406,another wonderful flight @virginamerica. i’m r...
478106,"hi there, can you please direct message your f..."


In [176]:
# Replace the typographic apostrophe with the ASCII one (plain-string replacement).
text_df["text"] = text_df["text"].str.replace("’", "'", regex=False)
# Contraction -> expansion. Order matters: the specific "won't" / "can't"
# entries have to be tried before the generic "n't" suffix.
APOSTROPHES = {
    "'m":    "am",
    "'re":   "are",
    "'ve":   "have",
    "'ll":   "will",
    "'d":    "would",
    "won't": "will not",
    "can't": "cannot",
    "n't":   "not",
}


def _compile_contraction(contraction):
    """Build an anchored regex for one key of ``APOSTROPHES``."""
    # A suffix such as 'd must be glued to a preceding word character;
    # otherwise the apostrophe is an opening quote ('do it').
    lookbehind = r"(?<=\w)" if contraction.startswith("'") else ""
    # The contraction must also end the word, so names like o'donnell survive.
    return re.compile(lookbehind + re.escape(contraction) + r"\b")


# Compiled once, in the same order as APOSTROPHES.
_APOSTROPHE_PATTERNS = [
    (_compile_contraction(contraction), expansion)
    for contraction, expansion in APOSTROPHES.items()
]


def convert_apostrophes(text):
    """Expand English contractions in ``text`` (e.g. ``i'm`` -> ``i am``).

    Typographic apostrophes are normalised to ``'`` first. Each expansion is
    padded with a space on both sides, so the result may contain double
    spaces; later steps re-tokenise on whitespace.
    """
    text = text.replace("’", "'")
    for pattern, expansion in _APOSTROPHE_PATTERNS:
        text = pattern.sub(f" {expansion} ", text)
    return text

# Expand contractions in every tweet and preview the result.
text_df["text"] = text_df["text"].map(convert_apostrophes)
text_df.head()

Unnamed: 0,text
1240028,do not even want to talk about it... you know...
2245905,no problem :) ^hp
1420724,you will need to contact the post office (usps...
1540406,another wonderful flight @virginamerica. i am ...
478106,"hi there, can you please direct message your f..."


In [177]:
# Abbreviation -> expansion lookup, read from a file of "key=value" lines.
ABBREVIATIONS = {}

with open("../data/abbr.txt", 'r') as f:
    for line in f:
        line = line.lower().strip().replace(",", "")
        # Skip blank or malformed lines instead of failing to unpack them.
        if "=" not in line:
            continue
        # Split on the first "=" only, so a value may itself contain "=".
        k, v = line.split("=", 1)
        # Trim both sides: a key with stray whitespace could never equal a
        # whitespace-split token.
        ABBREVIATIONS[k.strip()] = v.strip()


def convert_abbreviations(text):
    """Replace every whitespace-separated token found in ``ABBREVIATIONS``.

    Tokens are compared verbatim, and the result is re-joined with single
    spaces.
    """
    # NOTE(review): this runs before punctuation removal, so a token with
    # attached punctuation (e.g. "asap,") will not be matched — confirm intended.
    return " ".join(ABBREVIATIONS.get(w, w) for w in text.split())


text_df["text"] = text_df["text"].apply(convert_abbreviations)
text_df.head()

Unnamed: 0,text
1240028,do not even want to talk about it... you know ...
2245905,no problem :) ^hp
1420724,you will need to contact the post office (usps...
1540406,another wonderful flight @virginamerica. i am ...
478106,"hi there, can you please direct message your f..."


In [178]:
# One or more consecutive digits. ("\d*" would also match the empty string at
# every position, triggering a pointless substitution between all characters.)
NUMBER_PATTERN = re.compile(r"\d+")


def remove_numbers(text):
    """Return ``text`` with every run of digits deleted."""
    return NUMBER_PATTERN.sub("", text)

# Delete digits from every tweet and preview the result.
text_df["text"] = text_df["text"].map(remove_numbers)
text_df.head()

Unnamed: 0,text
1240028,do not even want to talk about it... you know ...
2245905,no problem :) ^hp
1420724,you will need to contact the post office (usps...
1540406,another wonderful flight @virginamerica. i am ...
478106,"hi there, can you please direct message your f..."


In [179]:
# ASCII punctuation only; typographic characters such as ’ are not included.
PUNCTUATION_TO_REMOVE = string.punctuation


def remove_punctuation(text):
    """Return ``text`` with every character of ``PUNCTUATION_TO_REMOVE`` deleted."""
    # Mapping a character to None tells str.translate to drop it.
    deletion_table = str.maketrans(dict.fromkeys(PUNCTUATION_TO_REMOVE))
    return text.translate(deletion_table)

# Strip ASCII punctuation from every tweet and preview the result.
# NOTE(review): this runs before the emoticon/emoji conversion further down,
# so ASCII emoticons such as ":)" are already gone when that step executes —
# consider swapping the two steps.
text_df["text"] = text_df["text"].map(remove_punctuation)
text_df.head()

Unnamed: 0,text
1240028,do not even want to talk about it you know the...
2245905,no problem hp
1420724,you will need to contact the post office usps ...
1540406,another wonderful flight virginamerica i am re...
478106,hi there can you please direct message your fu...


In [180]:
# Emoticon / emoji -> normalised description: colons and commas dropped,
# spaces turned into underscores, lower-cased.
EMOT_EMOJI = {
    emoji: value.replace(":", "").replace(",", "").replace(" ", "_").lower()
    for emoji, value in {**EMOTICONS_EMO, **UNICODE_EMOJI}.items()
}

# One alternation over all keys instead of one regex pass per key. Longest keys
# come first so that a multi-character symbol wins over any shorter key that
# happens to be contained in it.
_EMOT_EMOJI_PATTERN = re.compile(
    "|".join(
        re.escape(symbol)
        for symbol in sorted(EMOT_EMOJI, key=len, reverse=True)
    )
)


def _describe_symbol(match):
    """Return the replacement text (leading space + description) for one match."""
    return f" {EMOT_EMOJI[match.group(0)]}"


def convert_emot_emoji(text):
    """Replace every known emoticon/emoji in ``text`` with `` <description>``."""
    # An empty alternation would match the empty string everywhere.
    if not EMOT_EMOJI:
        return text
    return _EMOT_EMOJI_PATTERN.sub(_describe_symbol, text)


text_df["text"] = text_df["text"].apply(convert_emot_emoji)
text_df.head()

Unnamed: 0,text
1240028,do not even want to talk about it you know the...
2245905,no problem hp
1420724,you will need to contact the post office usps ...
1540406,another wonderful flight virginamerica i am re...
478106,hi there can you please direct message your fu...


In [181]:
# A string made up solely of lowercase a-z, underscores and spaces (may be empty).
ENG_PATTERN = re.compile(r"^[a-z_ ]*$")


def extract_eng(text):
    """Return True when ``text`` does NOT match ``ENG_PATTERN``.

    Despite the name, the flag marks the rows that contain characters outside
    a-z, "_" and space. Every flagged text is also printed.
    """
    has_foreign_chars = ENG_PATTERN.match(text) is None
    if has_foreign_chars:
        print(text)
    return has_foreign_chars

# Count the tweets flagged by extract_eng (characters outside a-z, "_" and space).
non_english_flags = text_df["text"].copy().map(extract_eng)
non_english_flags.sum()

1783

In [182]:
# A set gives constant-time membership tests; the list returned by NLTK would
# be scanned linearly for every token of every tweet.
STOPWORDS = set(stopwords.words("english"))


def remove_stopwords(text):
    """Tokenise ``text`` with NLTK and drop every token found in ``STOPWORDS``.

    The remaining tokens are re-joined with single spaces.
    """
    # NOTE(review): the preview output shows "do not even want to talk" becoming
    # "even want talk", i.e. negations are removed here too — confirm this is
    # acceptable for the downstream task.
    return " ".join(w for w in nltk.word_tokenize(text) if w not in STOPWORDS)


text_df["text"] = text_df["text"].apply(remove_stopwords)
text_df.head()

Unnamed: 0,text
1240028,even want talk know problems
2245905,problem hp
1420724,need contact post office usps contact kd
1540406,another wonderful flight virginamerica really ...
478106,hi please direct message full name booking ref...


In [183]:
spell = SpellChecker()

# word -> correction, shared across tweets: the same misspellings recur, and
# computing a correction is by far the most expensive part of this step.
_SPELL_CACHE = {}


def _correct_word(word):
    """Return the cached spelling correction for ``word``."""
    if word not in _SPELL_CACHE:
        # correction() can return None when it finds no candidate; fall back to
        # the original word so that " ".join() never receives a None.
        _SPELL_CACHE[word] = spell.correction(word) or word
    return _SPELL_CACHE[word]


def correct_spellings(text):
    """Replace every word unknown to the spell checker with its best correction.

    Words are split on whitespace and re-joined with single spaces; words with
    no available correction are kept unchanged.
    """
    words = text.split()
    misspelled_words = spell.unknown(words)
    return " ".join(
        _correct_word(w) if w in misspelled_words else w
        for w in words
    )


text_df["text"] = text_df["text"].apply(correct_spellings)
text_df.head()


KeyboardInterrupt



In [None]:
# Lemmatization is preferred for this dataset because of the large number of
# complex words that need more careful processing, which in turn affects the
# data processing speed.

lemmatizer = WordNetLemmatizer()


def lemmatize_words(text):
    """Lemmatize each whitespace-separated word of ``text`` and re-join with spaces."""
    lemmas = map(lemmatizer.lemmatize, text.split())
    return " ".join(lemmas)


text_df["text"] = text_df["text"].map(lemmatize_words)
text_df.head()

In [None]:
# Most frequent words in the dataset: count every whitespace-separated token
# over all tweets, then keep the ten most common.
cnt = Counter(
    word
    for text in text_df["text"].values
    for word in text.split()
)

most_common = cnt.most_common(10)
most_common

In [None]:
# The words of the most_common list, without their counts.
FREQWORDS = {word for word, _ in most_common}


def remove_freqwords(text):
    """Return ``text`` without the words listed in ``FREQWORDS``."""
    kept_words = (w for w in str(text).split() if w not in FREQWORDS)
    return " ".join(kept_words)


text_df["text"] = text_df["text"].map(remove_freqwords)
text_df.head()

In [None]:
# Rarest words in the dataset: the tail of the frequency ranking, rarest first.
num_rare_words = 20
cnt.most_common()[::-1][:num_rare_words]  # TODO: improve later

In [None]:
# The num_rare_words least frequent words, without their counts.
RAREWORDS = {word for word, _ in cnt.most_common()[::-1][:num_rare_words]}


def remove_rarewords(text):
    """Return ``text`` without the words listed in ``RAREWORDS``."""
    kept_words = (w for w in str(text).split() if w not in RAREWORDS)
    return " ".join(kept_words)


text_df["text"] = text_df["text"].map(remove_rarewords)
text_df.head()

In [None]:
# Split into training and test sets (80% / 20%). The fixed random_state makes
# the split, and everything derived from it, reproducible across runs.
train_data, test_data = train_test_split(text_df["text"], train_size=0.8, random_state=42)

In [None]:
# Vectorise the text by the frequency of each word. The vocabulary is fitted on
# the training split only; the frame below holds the counts of the test split.
count_vect = CountVectorizer(analyzer='word', max_features=500, min_df=5, max_df=0.8)
count_terms = count_vect.fit_transform(train_data)
count_tokens = count_vect.get_feature_names_out()
count_terms_vect = count_vect.transform(test_data)

# toarray() returns a plain ndarray; todense() would produce a numpy.matrix,
# a type NumPy recommends against using.
df_count_vect = pd.DataFrame(data=count_terms_vect.toarray(), columns=count_tokens)
df_count_vect

In [None]:
# Total count of every vocabulary word, summed over the rows of df_count_vect,
# followed by a report of the word with the highest total.
# NOTE(review): df_count_vect is built from the test split only, while the
# printed message speaks of the whole dataset — confirm which one is intended.
df_count_vect_sum = df_count_vect.sum()
print(f'Наиболее часто встречающееся слово в датасете: {df_count_vect_sum.idxmax()} - {df_count_vect_sum.max()}')

In [None]:
df_count_vect.describe()

In [None]:
# Word totals, sorted from most to least frequent, drawn as a single line.
fig, ax = plt.subplots(1, 1, figsize=(20, 5))
ax.set(
    xlabel='Слова',
    ylabel='Частота',
    title="Векторизация по частоте слов",
)
# Thin out the x ticks: one major tick every (number of columns / 100) positions.
ax.xaxis.set_major_locator(ticker.MultipleLocator(df_count_vect.shape[1] / 100))
plt.xticks(rotation = 90)

ax.plot(df_count_vect_sum.sort_values(ascending=False))
plt.show()

In [None]:
# Principal component analysis (compressed correlation).
# n_components may not exceed either dimension of the input, so cap the
# requested 100 components by the shape of the frame. The fixed random_state
# keeps the result reproducible if a randomised SVD solver is selected.
n_components = min(100, *df_count_vect.shape)
pca = PCA(n_components=n_components, random_state=42)
corr_pca = pca.fit_transform(df_count_vect)
corr_pca

In [None]:
# Draw the PCA-transformed matrix: one line per component column.
_, ax = plt.subplots(1, 1, figsize=(20, 5))
ax.set(
    xlabel='Компоненты',
    ylabel='Значения',
    title="Метод главных компонентов",
)

ax.plot(corr_pca)
plt.show()