In [1]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
path = pd.read_csv("/content/train.csv")

In [4]:
path.columns

Index(['id', 'tweet', 'state', 'location', 's1', 's2', 's3', 's4', 's5', 'w1',
       'w2', 'w3', 'w4', 'k1', 'k2', 'k3', 'k4', 'k5', 'k6', 'k7', 'k8', 'k9',
       'k10', 'k11', 'k12', 'k13', 'k14', 'k15'],
      dtype='object')

In [7]:
def _load_and_shuffle_data(data_path,
                           file_name,
                           cols,
                           seed,
                           separator=',',
                           header=0):
    """Loads and shuffles the dataset using pandas.
    # Arguments
        data_path: string, path to the data directory.
        file_name: string, name of the data file.
        cols: list, columns to load from the data file.
        seed: int, seed for randomizer.
        separator: string, separator to use for splitting data.
        header: int, row to use as data header.
    """
    np.random.seed(seed)
    data_path = os.path.join(data_path, file_name)
    data = pd.read_csv(data_path, usecols=cols, sep=separator, header=header)
    return data.reindex(np.random.permutation(data.index))


def _split_training_and_validation_sets(texts, labels, validation_split):
    """Splits the texts and labels into training and validation sets.
    # Arguments
        texts: list, text data.
        labels: list, label data.
        validation_split: float, percentage of data to use for validation.
    # Returns
        A tuple of training and validation data.
    """
    num_training_samples = int((1 - validation_split) * len(texts))
    return ((texts[:num_training_samples], labels[:num_training_samples]),
            (texts[num_training_samples:], labels[num_training_samples:]))
    
def load_tweet_weather_topic_classification_dataset(data_path,
                                                    validation_split=0.2,
                                                    seed=123):
   
    columns = [1] + [i for i in range(13, 28)]  # 1 - text, 13-28 - topics.
    data = _load_and_shuffle_data(data_path, 'train.csv', columns, seed)

    # Get tweet text and the max confidence score for the weather types.
    texts = list(data['tweet'])
    weather_data = data.iloc[:, 1:]

    labels = []
    for i in range(len(texts)):
        # Pick topic with the max confidence score.
        labels.append(np.argmax(list(weather_data.iloc[i, :].values)))

    return _split_training_and_validation_sets(
        texts, np.array(labels), validation_split)

In [8]:
(train_data, train_labels),(test_data, test_labels)=load_tweet_weather_topic_classification_dataset('')

In [9]:
train_data

['79.0F (Feels: 79.0F) - Humidity: 99% - Wind: 8.3mph N - Gust: 9.8mph - Pressure: 1010.6mb (+0.1)  #weather {link}',
 "I'm glad it's nice weather here for Bike to Work Day. Not that I did. Bike to work. I just like nice weather.",
 'Freezing Ass Cold. Sometimes I Hate Central Air. Smh.',
 '@mention I will trust what you say weather brandy!',
 'Powell WX (05/20/11 10:00 PM) Temp=66.0&#xB0;F \\ Daily Rain= 0.00 in. \\ Max Gust=4.9 mph',
 'Finding peace through the storm',
 'Breaking News? Maybe if it was 75 & Sunny! RT @mention Breaking News: Cape weather: Clouds coming {link}',
 "Watch Austins baseball practice from the car. It's too darn cold out!",
 "Good morning. Outside our house. It's sunny today. Great day.",
 '@mention lol the fiercest weather..aint that when we fell in like the mud or something',
 'Gorgeous sunshine outside, lovely drive, painless dr. visit, even got some work done. Not a bad day at all. Now for some Mexican food.',
 'Horrible storm in Missouri!!!!  Tornado des

In [10]:
train_labels

array([ 4, 12,  1, ...,  4,  6,  1])

In [14]:
X=train_data

In [15]:
len(X)

62356

In [16]:
y=train_labels

In [17]:
len(y)

62356

In [18]:
X[0]

'79.0F (Feels: 79.0F) - Humidity: 99% - Wind: 8.3mph N - Gust: 9.8mph - Pressure: 1010.6mb (+0.1)  #weather {link}'

In [20]:
y[0]

4

In [25]:
def tweet_preprocessor(tweet):
    tweet = tweet.replace('\n', ' ') # remove line breaks
    tweet = re.sub(r"\bhttps://t.co/\w+", '', tweet) # remove URL's
    tweet = re.sub('\w*\d\w*', ' ', tweet) # remove numbers
    tweet = re.sub('[%s]' % re.escape(string.punctuation), ' ',   
            tweet.lower()) # remove capital letters and punctuation
    
    return tweet

In [30]:
path.columns

Index(['id', 'tweet', 'state', 'location', 's1', 's2', 's3', 's4', 's5', 'w1',
       'w2', 'w3', 'w4', 'k1', 'k2', 'k3', 'k4', 'k5', 'k6', 'k7', 'k8', 'k9',
       'k10', 'k11', 'k12', 'k13', 'k14', 'k15'],
      dtype='object')

In [33]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2,stratify=y)

In [34]:
X_train, X_val, y_train, y_val = train_test_split(X_train,y_train,test_size=0.2,stratify=y_train)

**Data preprocessing**

In [35]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import SelectKBest, f_classif

In [36]:
vectorizer = TfidfVectorizer( 
                             strip_accents='unicode', 
                             decode_error='replace',
                             dtype='int32',                              
                             analyzer="word", 
                             ngram_range=(1, 2), 
                             min_df=2)

In [37]:
X_train = vectorizer.fit_transform(X_train)



In [38]:
X_test = vectorizer.transform(X_test)

In [39]:
X_val = vectorizer.transform(X_val)

In [40]:
selector = SelectKBest(f_classif, k = min(20000, X_train.shape[1]))
selector.fit(X_train, y_train)

SelectKBest(k=20000)

In [41]:
X_train = selector.transform(X_train).astype('float32')
X_test = selector.transform(X_test).astype('float32')
X_val = selector.transform(X_val).astype('float32')

In [42]:
X_train.shape

(39907, 20000)

In [43]:
X_test.shape

(12472, 20000)

In [44]:
y_train = np.array(y_train)
y_test = np.array(y_test)
y_val = np.array(y_val)

In [45]:
X_train = X_train.toarray()
X_test = X_test.toarray()
X_val = X_val.toarray()

In [46]:
X_train.shape

(39907, 20000)

**ANN**

In [47]:
from keras.models import Sequential, load_model
from keras.layers import Dense, Dropout

In [48]:
newsANN = Sequential()

In [49]:
newsANN.add(Dense(units=512, activation='relu', input_dim=20000))
newsANN.add(Dense(units=1, activation='sigmoid'))

In [50]:
newsANN.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [51]:
from keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau
es = EarlyStopping(monitor='val_accuracy', min_delta=0, patience=20, verbose=1, mode='auto', baseline=None, restore_best_weights=False)
mc = ModelCheckpoint(filepath='bestweights.h5', monitor='val_accuracy', verbose=1, save_best_only=True)
rd = ReduceLROnPlateau(monitor='val_accuracy', factor=0.1, patience=10, verbose=1, mode='auto')

In [52]:
history = newsANN.fit(X_train, y_train, epochs=20, callbacks=[es,rd,mc], validation_split=0.25)

Epoch 1/20
Epoch 1: val_accuracy improved from -inf to 0.09101, saving model to bestweights.h5
Epoch 2/20
Epoch 2: val_accuracy did not improve from 0.09101
Epoch 3/20
Epoch 3: val_accuracy did not improve from 0.09101
Epoch 4/20
Epoch 4: val_accuracy did not improve from 0.09101
Epoch 5/20
Epoch 5: val_accuracy did not improve from 0.09101
Epoch 6/20
Epoch 6: val_accuracy did not improve from 0.09101
Epoch 7/20
Epoch 7: val_accuracy did not improve from 0.09101
Epoch 8/20
Epoch 8: val_accuracy did not improve from 0.09101
Epoch 9/20
Epoch 9: val_accuracy did not improve from 0.09101
Epoch 10/20
Epoch 10: val_accuracy did not improve from 0.09101
Epoch 11/20
Epoch 11: ReduceLROnPlateau reducing learning rate to 0.00010000000474974513.

Epoch 11: val_accuracy did not improve from 0.09101
Epoch 12/20
Epoch 12: val_accuracy did not improve from 0.09101
Epoch 13/20
Epoch 13: val_accuracy did not improve from 0.09101
Epoch 14/20
Epoch 14: val_accuracy did not improve from 0.09101
Epoch 15/2

In [53]:
newmodel = load_model('bestweights.h5')

In [54]:
newmodel.evaluate(X_test, y_test)



[-8896.595703125, 0.09220654517412186]

**Deployment in web server**

In [55]:
!pip install flask gevent requests pillow flask-ngrok pyngrok

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting gevent
  Downloading gevent-21.12.0-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (5.8 MB)
[K     |████████████████████████████████| 5.8 MB 5.0 MB/s 
Collecting flask-ngrok
  Downloading flask_ngrok-0.0.25-py3-none-any.whl (3.1 kB)
Collecting pyngrok
  Downloading pyngrok-5.1.0.tar.gz (745 kB)
[K     |████████████████████████████████| 745 kB 35.1 MB/s 
Collecting zope.interface
  Downloading zope.interface-5.4.0-cp37-cp37m-manylinux2010_x86_64.whl (251 kB)
[K     |████████████████████████████████| 251 kB 49.0 MB/s 
[?25hCollecting zope.event
  Downloading zope.event-4.5.0-py2.py3-none-any.whl (6.8 kB)
Building wheels for collected packages: pyngrok
  Building wheel for pyngrok (setup.py) ... [?25l[?25hdone
  Created wheel for pyngrok: filename=pyngrok-5.1.0-py3-none-any.whl size=19007 sha256=430e5014da0b261723f16ac661baa56fa106dfb130fdded2d2e7521c21efcac3
  St

In [56]:
from flask_ngrok import run_with_ngrok
from flask import Flask, render_template, request
from keras.preprocessing.image import load_img, img_to_array

In [57]:
procfile = 'web: gunicorn app:app'
procfiles = open('/content/Procfile', 'w')

In [58]:
procfiles.write(procfile)

21

In [59]:
procfiles.close()

In [60]:
!mkdir '/content/templates'

**Connecting webpage with ANN**

In [61]:
import pyngrok

In [62]:
!ngrok authtoken 2ElvUsOWqaMY22pOX3juthVikxW_6fh4DFTCFRVEuTnbDgYHS

Authtoken saved to configuration file: /root/.ngrok2/ngrok.yml


In [66]:
app = Flask(__name__)
run_with_ngrok(app)

@app.route('/')
def home():
  return render_template('index.html')

@app.route('/', methods=['POST'])
def prediction():
  data = request.form['textbox']
  features = [data]

if __name__=='__main__':
  app.run()

 * Serving Flask app "__main__" (lazy loading)
 * Environment: production
[2m   Use a production WSGI server instead.[0m
 * Debug mode: off


INFO:werkzeug: * Running on http://127.0.0.1:5000/ (Press CTRL+C to quit)


 * Running on http://242f-35-188-101-44.ngrok.io
 * Traffic stats available on http://127.0.0.1:4040


INFO:werkzeug:127.0.0.1 - - [15/Sep/2022 15:34:05] "[37mGET / HTTP/1.1[0m" 200 -
INFO:werkzeug:127.0.0.1 - - [15/Sep/2022 15:34:06] "[33mGET /favicon.ico HTTP/1.1[0m" 404 -
ERROR:__main__:Exception on / [POST]
Traceback (most recent call last):
  File "/usr/local/lib/python3.7/dist-packages/flask/app.py", line 2447, in wsgi_app
    response = self.full_dispatch_request()
  File "/usr/local/lib/python3.7/dist-packages/flask/app.py", line 1953, in full_dispatch_request
    return self.finalize_request(rv)
  File "/usr/local/lib/python3.7/dist-packages/flask/app.py", line 1968, in finalize_request
    response = self.make_response(rv)
  File "/usr/local/lib/python3.7/dist-packages/flask/app.py", line 2098, in make_response
    "The view function did not return a valid response. The"
TypeError: The view function did not return a valid response. The function either returned None or ended without a return statement.
INFO:werkzeug:127.0.0.1 - - [15/Sep/2022 15:34:10] "[35m[1mPOST / HTTP