In [2]:
import numpy as np 
import pandas as pd
from sklearn import model_selection
import re
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression

nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/veraachiaa/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/veraachiaa/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [3]:
train_data = pd.read_csv("Datasets/train.csv")
test_data = pd.read_csv("Datasets/test.csv")

tweets = train_data
tweets.head()

def clean_text(tweet_text):
    # Remove mentions
    mention_pattern = re.compile(r'@\w+')
    tweet_text = re.sub(mention_pattern, '', tweet_text)

    # Remove Hashtags
    tweet_text = re.sub(r'\#\w+', '', tweet_text)

    # Remove retweets
    tweet_text = re.sub(r'RT[\s]+', '', tweet_text)

    # Remove urls
    tweet_text = re.sub(r'https?:\/\/\S+', '', tweet_text)

    # Remove emojis
    emoji_pattern = re.compile("["
                               u"\U0001F600-\U0001F64F"  # emoticons
                               u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                               u"\U0001F680-\U0001F6FF"  # transport & map symbols
                               u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                               u"\U00002600-\U000027BF"  # miscellaneous symbols
                               u"\U0001F910-\U0001F9FF"  # faces with accessories
                               u"\u200d"  # zero-width joiner
                               u"\u2600-\u26FF\u2700-\u27BF"  # additional symbols
                               u"\u3000-\u303F"  # punctuation symbols
                               u"\uFE0F"  # emoji variation selector
                               "]+", flags=re.UNICODE)
    tweet_text = re.sub(emoji_pattern, '', tweet_text)

    # Remove newlines
    newline_pattern = re.compile(r'\n')
    tweet_text = re.sub(newline_pattern, '', tweet_text)

    # Remove stopwords
    tokens = word_tokenize(tweet_text)

    # Get the list of stopwords
    stop_words = set(stopwords.words('english'))

    # Remove stopwords from the tokens
    filtered_tokens = [word for word in tokens if word.lower() not in stop_words]

    # Join the filtered tokens back into a single string
    filtered_tweet = ' '.join(filtered_tokens)

    return filtered_tweet


tweets['text'] = tweets['text'].apply(clean_text)

In [4]:
tweets.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Deeds Reason May ALLAH Forgive us,1
1,4,,,Forest fire near La Ronge Sask . Canada,1
2,5,,,residents asked 'shelter place ' notified offi...,1
3,6,,,"13,000 people receive evacuation orders Califo...",1
4,7,,,got sent photo Ruby smoke pours school,1


In [5]:
# Taining and Testing
X = tweets['text']
y = tweets['target']

X_train, X_test, y_train, y_test = model_selection.train_test_split(X, y, test_size=0.2, random_state=42)

# Using TF-IDF vectorizer to convert text into numerical features
vectorizer = TfidfVectorizer()
X_train_tfidf = vectorizer.fit_transform(X_train)

#Logistics Regression
model = LogisticRegression()

# Train the model
model.fit(X_train_tfidf, y_train)

X_test = test_data['text']
X_test_tfidf = vectorizer.transform(X_test)

# Making predictions on the test set
predictions = model.predict(X_test_tfidf)

test_data['prediction'] = predictions


# Print the predictions
print(test_data[['text', 'prediction']])
test_data.to_csv('test_predictions.csv', index=False)

                                                   text  prediction
0                    Just happened a terrible car crash           1
1     Heard about #earthquake is different cities, s...           1
2     there is a forest fire at spot pond, geese are...           1
3              Apocalypse lighting. #Spokane #wildfires           0
4         Typhoon Soudelor kills 28 in China and Taiwan           1
...                                                 ...         ...
3258  EARTHQUAKE SAFETY LOS ANGELES ÛÒ SAFETY FASTE...           1
3259  Storm in RI worse than last hurricane. My city...           1
3260  Green Line derailment in Chicago http://t.co/U...           1
3261  MEG issues Hazardous Weather Outlook (HWO) htt...           1
3262  #CityofCalgary has activated its Municipal Eme...           1

[3263 rows x 2 columns]
