In [None]:
# Mount Google Drive at /content/drive; the dataset read later in this
# notebook lives under that mount point (Colab-only dependency).
from google.colab import drive
drive.mount('/content/drive')
# Download the NLTK stop-word corpus that the tweet-cleaning step relies on.
import nltk
nltk.download('stopwords')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [None]:
'''
Example script to generate text with a character-level LSTM (the recipe was written for Nietzsche's writings; this notebook trains it on cleaned tweets).
At least 20 epochs are required before the generated text
starts sounding coherent.
It is recommended to run this script on GPU, as recurrent
networks are quite computationally intensive.
If you try this script on new data, make sure your corpus
has at least ~100k characters. ~1M is better.
'''

from __future__ import print_function
from keras.callbacks import LambdaCallback
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from keras.optimizers import RMSprop
from keras.utils.data_utils import get_file
import numpy as np
import pandas as pd
import random
import sys
import io

#path = '../data/train_01.txt'

# Load the raw tweet table from the mounted Google Drive folder.
# NOTE(review): the rows displayed under this cell contain sequences such as
# "ð" and "â", which suggests the file may really be UTF-8 that is being
# decoded as ISO-8859-1 here — confirm the source encoding before changing it.
df = pd.read_csv('/content/drive/MyDrive/cuatrimestres/Noveno cuatri/Natural Language Preprocessing/Project/data.csv', encoding = "ISO-8859-1")


from nltk.corpus import stopwords
from nltk.tokenize import TweetTokenizer
from nltk.stem import PorterStemmer
import re

def scrub_words(text):
    """Strip markup and every non-ASCII-word character from ``text``.

    Args:
        text: The string to clean.

    Returns:
        ``text`` with ``<...>`` tags deleted, every character that is not an
        ASCII letter or underscore replaced by a single space, and leading and
        trailing whitespace removed. Interior runs of spaces are kept.
    """
    # remove html markup
    text = re.sub("(<.*?>)", "", text)

    # Replace non-word characters and digits with a space. re.ASCII is
    # required here: without it ``\W`` is Unicode-aware for str patterns, so
    # non-ASCII letters (e.g. "µ", "ð") count as word characters and survive.
    text = re.sub("(\\W|\\d)", " ", text, flags=re.ASCII)

    # trim leading/trailing whitespace
    return text.strip()
  
#Noise removal, stop word removal, normalizing?
def cleanString(s, special_chars = "\":,.@|ðÿœžðÿâœœïÿœžÿºÿÿœžÿ"):
    """Normalise one tweet: strip noise characters and English stop words.

    Args:
        s: Raw tweet text.
        special_chars: Characters that are deleted outright before any other
            processing.

    Returns:
        The surviving tokens joined by single spaces.
    """
    for char in special_chars:
        s = s.replace(char, "")
    # A line break separates words, so turn it into a space; deleting it
    # would glue the last word of one line to the first word of the next.
    s = s.replace("\n", " ")
    s = scrub_words(s)
    tokenizer = TweetTokenizer()
    stop_words = set(stopwords.words('english'))
    # The NLTK stop-word list is lower case, so compare case-insensitively;
    # otherwise capitalised stop words ("I", "The", ...) slip through.
    cleaned_words = [w for w in tokenizer.tokenize(s) if w.lower() not in stop_words]
    return " ".join(cleaned_words)

def stemWords(sentence):
    """Return ``sentence`` with every token reduced to its Porter stem.

    The sentence is split with ``TweetTokenizer``; the stemmed tokens are
    joined back together with single spaces.
    """
    porter = PorterStemmer()
    tokens = TweetTokenizer().tokenize(sentence)
    return " ".join(map(porter.stem, tokens))
    
def cleanFrame(frame):
    """Add a ``clean_tweet`` column to ``frame`` in place.

    Every value of the ``Tweet`` column is passed through ``cleanString`` and
    the results are stored under ``clean_tweet``. Nothing is returned.
    """
    cleaned = frame.Tweet.apply(cleanString)
    frame['clean_tweet'] = cleaned

# Clean only the first 4000 tweets — the only rows that end up in the corpus —
# instead of cleaning the whole frame and discarding the rest. The copy keeps
# pandas from warning about adding a column to a slice.
tweets_head = df.iloc[0:4000].copy()
cleanFrame(tweets_head)
df = tweets_head['clean_tweet']

# Write one cleaned tweet per line as plain text. Two deliberate differences
# from Series.to_csv(sep=' ', mode='a'):
#   * the file is overwritten, so re-running this cell does not append a
#     duplicate copy of the corpus;
#   * no CSV quoting is applied, so multi-word tweets are not wrapped in
#     double quotes that would leak into the character vocabulary.
with io.open('text.txt', 'w', encoding='utf-8') as corpus_file:
    corpus_file.writelines(tweet + '\n' for tweet in df)
df.head()

Unnamed: 0,Date,User,Tweet,Lang,Location,Label,clean_tweet
0,2022-11-29 23:59:55+00:00,yunhosdarling,I need paradigm on apple music now ðµâð«...,en,she. 18 â§Ë Â· . was luvjynho,0,I need paradigm apple music µ itunes link let
1,2022-11-29 23:59:55+00:00,luvzxai,ð­hbo go\n\n1 month solo: â±300\n1 month sh...,en,,0,hbo go month solo month shared months shared h...
2,2022-11-29 23:59:42+00:00,idvnno,I still use my Apple Music just so I can liste...,en,yyz,0,I still use Apple Music I listen embarrassing ...
3,2022-11-29 23:59:33+00:00,benjuhmenn,Officially switched from Apple Music to Spotif...,en,"Sin City (Las Vegas, NV)",0,Officially switched Apple Music Spotify idek I...
4,2022-11-29 23:59:06+00:00,BaiyaKashea,Apple Music so damn irritating bitch where is ...,en,"Nashville, TN",0,Apple Music damn irritating bitch music


In [None]:


# Read the exported corpus and lower-case it so the character vocabulary
# does not contain separate upper- and lower-case entries.
path = 'text.txt'
with io.open(path, encoding='utf-8') as f:
    text = f.read().lower()
print('corpus length:', len(text))

# Vocabulary: every distinct character in sorted order, with lookup tables
# in both directions (character -> id and id -> character).
chars = sorted(set(text))
print('total chars:', len(chars))
char_indices = {c: i for i, c in enumerate(chars)}
indices_char = {i: c for i, c in enumerate(chars)}

# Slide a window of maxlen characters over the text in strides of `step`;
# each window is one training input and the character right after it is
# the prediction target.
maxlen = 40
step = 3
_window_starts = range(0, len(text) - maxlen, step)
sentences = [text[start: start + maxlen] for start in _window_starts]
next_chars = [text[start + maxlen] for start in _window_starts]
print('nb sequences:', len(sentences))


corpus length: 275892
total chars: 50
nb sequences: 91951


In [None]:
print('Vectorization...')
from warnings import filterwarnings
# Harmless to keep: silences the NumPy deprecation warning should any other
# cell still use the `np.bool` alias.
filterwarnings(action='ignore', category=DeprecationWarning, message='`np.bool` is a deprecated alias')
# One-hot input tensor, indexed as (sequence, position in window, character id).
# The builtin `bool` replaces `np.bool`: that alias was deprecated in
# NumPy 1.20 and removed in 1.24, where it raises AttributeError.
x = np.zeros((len(sentences), maxlen, len(chars)), dtype=bool)

Vectorization...


In [None]:
# One-hot target matrix, indexed as (sequence, character id of the next char).
# The builtin `bool` replaces `np.bool`: that alias was deprecated in
# NumPy 1.20 and removed in 1.24, where it raises AttributeError.
y = np.zeros((len(sentences), len(chars)), dtype=bool)

In [None]:
# One-hot encode every window into x and the character that follows it into y.
for i, (sentence, target) in enumerate(zip(sentences, next_chars)):
    for t, char in enumerate(sentence):
        x[i, t, char_indices[char]] = 1
    y[i, char_indices[target]] = 1

In [None]:

# build the model: one 128-unit LSTM over the one-hot windows, followed by a
# softmax layer that scores every character in the vocabulary
print('Build model...')
model = Sequential([
    LSTM(128, input_shape=(maxlen, len(chars))),
    Dense(len(chars), activation='softmax'),
])

optimizer = RMSprop(learning_rate=0.01)
model.compile(
    optimizer=optimizer,
    loss='categorical_crossentropy',
    metrics=["accuracy"],
)


def sample(preds, temperature=1.0):
    """Draw one index from a probability array after temperature scaling.

    Args:
        preds: Array-like of probabilities.
        temperature: Divisor applied to the log-probabilities before they are
            re-normalised; values below 1 sharpen the distribution, values
            above 1 flatten it.

    Returns:
        The index selected by a single multinomial draw.
    """
    scaled_logits = np.log(np.asarray(preds).astype('float64')) / temperature
    weights = np.exp(scaled_logits)
    distribution = weights / np.sum(weights)
    draw = np.random.multinomial(1, distribution, 1)
    return np.argmax(draw)


def on_epoch_end(epoch, _):
    """Print sample text generated by the model; used as an epoch-end callback.

    A random window of ``maxlen`` characters from the corpus is used as the
    seed. For each diversity (sampling temperature) the seed is echoed and
    400 further characters are generated one at a time, each predicted from
    the most recent ``maxlen`` characters.

    Args:
        epoch: Index of the epoch that just finished (only printed).
        _: Second callback argument; unused.
    """
    print()
    print('----- Generating text after Epoch: %d' % epoch)

    vocab_size = len(chars)
    start_index = random.randint(0, len(text) - maxlen - 1)
    seed = text[start_index: start_index + maxlen]

    for diversity in [0.2, 0.5, 1.0, 1.2]:
        print('----- diversity:', diversity)
        print('----- Generating with seed: "' + seed + '"')
        sys.stdout.write(seed)

        # Rolling context: always the last maxlen characters produced so far.
        window = seed
        for _step in range(400):
            # One-hot encode the current window as a batch of size one.
            x_pred = np.zeros((1, maxlen, vocab_size))
            for t, char in enumerate(window):
                x_pred[0, t, char_indices[char]] = 1.

            preds = model.predict(x_pred, verbose=0)[0]
            next_char = indices_char[sample(preds, diversity)]

            # Drop the oldest character and append the new one.
            window = window[1:] + next_char

            sys.stdout.write(next_char)
            sys.stdout.flush()
        print()

# Print sample generations after every epoch while training.
print_callback = LambdaCallback(on_epoch_end=on_epoch_end)

# Train for 30 epochs in mini-batches of 128 windows.
model.fit(x, y, batch_size=128, epochs=30, callbacks=[print_callback])

Build model...
Epoch 1/30
----- Generating text after Epoch: 0
----- diversity: 0.2
----- Generating with seed: " samsung bur i logged wanted pick plan i"
 samsung bur i logged wanted pick plan i still apple music replay apple music replay apple music replay make apple music replay apple music replay apple music replay apple music replay apple music replay apple music replay apple music replay i listen made apple music replay apple music replay made spotify wrapped apple music replay spotify wrapped listen made apple music replay apple music replay apple music replay apple music replay ap
----- diversity: 0.5
----- Generating with seed: " samsung bur i logged wanted pick plan i"
 samsung bur i logged wanted pick plan im radilite live still apple music replay replay wond still chart tame apple music replay apple music lile apple music replay i listen spotify gring some i still apple music minut year apple music replay apple music replay still music listen gonna apple music replay wonkin

<keras.callbacks.History at 0x7f50038fffa0>

In [None]:
# Persist the trained model to 'LSTM.h5' in the current working directory.
model.save('LSTM.h5')
print('Model Saved!')

Model Saved!
