<a href="https://colab.research.google.com/github/DeisyData/BIT_IA_Bootcamp/blob/main/S10_C5_Twitter_NGram_skeleton.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [20]:
import pandas as pd
import matplotlib.pyplot as plt


# NLP
import re # regulary expression
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

from nltk.util import ngrams
from collections import Counter

# Paso 1. Cargar Datos y Limpiar

In [21]:
# Cargar datos
data = pd.read_csv('twitter.csv')

# Mostrar primeras filas
print(data.head())



   id  label                                              tweet
0   1      0   @user when a father is dysfunctional and is s...
1   2      0  @user @user thanks for #lyft credit i can't us...
2   3      0                                bihday your majesty
3   4      0  #model   i love u take with u all the time in ...
4   5      0             factsguide: society now    #motivation


In [22]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 31962 entries, 0 to 31961
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   id      31962 non-null  int64 
 1   label   31962 non-null  int64 
 2   tweet   31962 non-null  object
dtypes: int64(2), object(1)
memory usage: 749.2+ KB


* label 0 = positive
* label 1 = negative

¿Cuál es la distribución de las clases?

In [23]:
data['label'].value_counts()

Unnamed: 0_level_0,count
label,Unnamed: 1_level_1
0,29720
1,2242


In [24]:
data = data.drop(columns=['id'])
# Mostrar primeras filas
print(data.head())

   label                                              tweet
0      0   @user when a father is dysfunctional and is s...
1      0  @user @user thanks for #lyft credit i can't us...
2      0                                bihday your majesty
3      0  #model   i love u take with u all the time in ...
4      0             factsguide: society now    #motivation


In [25]:
# Limpiar texto
def clean_text(text):
      # sub(patron (regular expresion), texto de reemplazo, testo input)
      text = re.sub(r'@[A-Za-z0-9_]+', '', text )# Eliminar menciones @Deisy
      text = re.sub(r'http\S+', '', text )  # Eliminar
      text = re.sub(r'www\S+', '', text )  # Eliminar URLs
      text = re.sub(r'#', '',text)          # Eliminar hashtags
      text = re.sub(r"[^A-Za-z \d\s']",' ',text) # elimina todo lo que no es letras mayus o minus espacios y comilla sencilla"'"
      text = text.lower()              # Convertir a minúsculas lower(), eliminar nueva linea o tabulacion strip()
      return text

data['clean_tweet'] = data['tweet'].apply(clean_text)

In [26]:
print(data['tweet'].head(3))

0     @user when a father is dysfunctional and is s...
1    @user @user thanks for #lyft credit i can't us...
2                                  bihday your majesty
Name: tweet, dtype: object


In [27]:
print(data['clean_tweet'].head(3))

0      when a father is dysfunctional and is so sel...
1      thanks for lyft credit i can't use cause the...
2                                  bihday your majesty
Name: clean_tweet, dtype: object


# Paso 2. Tokenizacion y elminicación de stop words

In [28]:
import nltk
nltk.download('punkt_tab')
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [29]:
# Cargar las stopwords en ingles
stop_words = set(stopwords.words('english'))

# tokens
def tokenize_text(text):
    tokens = word_tokenize(text)
    tokens = [word for word in tokens if word not in stop_words]
    return tokens

data['tokens'] = data['clean_tweet'].apply(tokenize_text)

print(data['tokens'].head(10))

0    [father, dysfunctional, selfish, drags, kids, ...
1    [thanks, lyft, credit, ca, n't, use, cause, n'...
2                                    [bihday, majesty]
3                  [model, love, u, take, u, time, ur]
4                    [factsguide, society, motivation]
5    [2, 2, huge, fan, fare, big, talking, leave, c...
6                           [camping, tomorrow, danny]
7    [next, school, year, year, exams, ca, n't, thi...
8    [love, land, allin, cavs, champions, cleveland...
9                               [welcome, 'm, 's, gr8]
Name: tokens, dtype: object


In [30]:
print(data['clean_tweet'].head(10))

0      when a father is dysfunctional and is so sel...
1      thanks for lyft credit i can't use cause the...
2                                  bihday your majesty
3    model   i love u take with u all the time in u...
4                factsguide  society now    motivation
5     2 2  huge fan fare and big talking before the...
6                     camping tomorrow        danny   
7    the next school year is the year for exams    ...
8    we won    love the land    allin cavs champion...
9                 welcome here    i'm   it's so gr8   
Name: clean_tweet, dtype: object


# Paso 3. N-gram

In [31]:
# Crear N-grams y contar su frecuencia
# data['tokens'] = es el conjunto de tokens
# tokens => [welcome, 'm, 's, gr8]
# token/word => welcome
all_word = [word for tokens in data['tokens'] for word in tokens]

In [41]:
unigrams = ngrams(all_word, 1)
bigrams = ngrams(all_word, 2)
trigrams = ngrams(all_word, 3)

In [42]:
fdist_unigrams = nltk.FreqDist(unigrams)
fdist_bigrams = nltk.FreqDist(bigrams)
fdist_trigrams = nltk.FreqDist(trigrams)

In [44]:
fdist_bigrams.most_common(10)

[(('ca', "n't"), 747),
 (("'s", 'day'), 440),
 (('thankful', 'positive'), 429),
 (('father', "'s"), 423),
 (('positive', 'affirmation'), 352),
 (('love', 'u'), 351),
 (('u', 'take'), 327),
 (('take', 'u'), 326),
 (('model', 'love'), 325),
 (('u', 'time'), 325)]

In [46]:
fdist_bigrams.most_common(5)

[(('ca', "n't"), 747),
 (("'s", 'day'), 440),
 (('thankful', 'positive'), 429),
 (('father', "'s"), 423),
 (('positive', 'affirmation'), 352)]

In [47]:
fdist_trigrams.most_common(5)

[(('father', "'s", 'day'), 400),
 (('model', 'love', 'u'), 325),
 (('love', 'u', 'take'), 325),
 (('u', 'take', 'u'), 325),
 (('take', 'u', 'time'), 325)]

In [37]:
# Unigrams, bigrams y trigrams


# Mostrar los 10 más frecuentes


In [38]:
# Visualizar bigrams más frecuentes




```
# This is formatted as code
```

# Paso 4. Colocación (Collocations)

¿ Cómo eliminamos los términos irrelevantes?