In [None]:
import os

def scale_input_data(scale_factor):
  file_bases = ['./input/train', './input/test']
  for file_base in file_bases:
    import pandas as pd
    import shutil
    if scale_factor == 1.0:
      shutil.copyfile(file_base + '.csv', file_base + '.scaled.csv')
      continue
    df_to_scale = pd.read_csv(file_base + '.csv')
    new_num_rows = int(scale_factor * len(df_to_scale))
    if scale_factor <= 1.0:
      df_to_scale = df_to_scale.iloc[:new_num_rows]
    else:
      while len(df_to_scale) < new_num_rows:
        df_to_scale = pd.concat([df_to_scale, df_to_scale[:min(new_num_rows - len(df_to_scale), len(df_to_scale))]])
    df_to_scale.to_csv(file_base + '.scaled.csv', index=False)

if 'INPUT_SCALE_FACTOR' in os.environ:
  scale_input_data(float(os.environ['INPUT_SCALE_FACTOR']))

In [1]:
# import pandas as pd
exec(os.environ['IREWR_IMPORTS'])
import numpy as np
import re
# ALEX: remove ML code
# from nltk.stem import WordNetLemmatizer
# from nltk.corpus import names, stopwords
import unicodedata
# from sklearn.model_selection import train_test_split
# from sklearn.feature_extraction.text import TfidfVectorizer
# from sklearn.pipeline import Pipeline
# from sklearn.metrics import classification_report
# from sklearn.linear_model import LogisticRegression

# Importing Files

In [25]:
tweet_train_df = pd.read_csv('./input/train.scaled.csv')
tweet_test_df = pd.read_csv('./input/test.scaled.csv')
Id = tweet_test_df['id']

Unnamed: 0     int64
id             int64
keyword       object
location      object
text          object
target         int64
dtype: object

# Data Familiarization

In [3]:
tweet_train_df.head()

Unnamed: 0.1,Unnamed: 0,id,keyword,location,text,target
0,0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,1,4,,,Forest fire near La Ronge Sask. Canada,1
2,2,5,,,All residents asked to 'shelter in place' are ...,1
3,3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


Keyword and location  contains lots of Missing values.

In [4]:
tweet_train_df.isna().sum()

Unnamed: 0      0
id              0
keyword        31
location      748
text            0
target          0
dtype: int64

In [5]:
tweet_train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2283 entries, 0 to 2282
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Unnamed: 0  2283 non-null   int64 
 1   id          2283 non-null   int64 
 2   keyword     2252 non-null   object
 3   location    1535 non-null   object
 4   text        2283 non-null   object
 5   target      2283 non-null   int64 
dtypes: int64(3), object(3)
memory usage: 107.1+ KB


In [6]:
tweet_train_df.shape

(2283, 6)

7613 rows of tweets with 4 columns id, keyword, location and text all object

# Data Imputation

In [7]:
def data_imputation(data):
    data['keyword'].fillna(' ', inplace=True)
    data['location'].fillna(' ', inplace=True)
    return data

In [8]:
tweet_train_df = data_imputation(tweet_train_df)

In [9]:
tweet_test_df = data_imputation(tweet_test_df)

In [10]:
tweet_train_df.isna().sum()

Unnamed: 0    0
id            0
keyword       0
location      0
text          0
target        0
dtype: int64

In [11]:
tweet_train_df['text'] = tweet_train_df['text'] +' '+ tweet_train_df['location'] +' '+ tweet_train_df['keyword']
tweet_test_df['text'] = tweet_test_df['text'] +' '+ tweet_test_df['location'] +' '+ tweet_test_df['keyword']

All the missing columns are filled with appropriate values, later maybe we will change value of keyword dynamically for each row accordingly for now it's Na.
Missing locations are set to Unknown

# Text Filtering
Filtering Keyword, location and text columns by removing numbers, hashtags, names, Url and mentions

In [12]:
target = tweet_train_df['target']

In [13]:
tweet_train_df.drop(['target', 'location', 'keyword', 'id'], axis=1, inplace=True)
tweet_test_df.drop(['location', 'keyword', 'id'], axis=1, inplace=True)

In [14]:
# ALEX: remove ML code
# lemmetizer = WordNetLemmatizer()

In [15]:
# ALEX: remove ML code
# all_names = set(names.words())

In [16]:
# ALEX: remove ML code
# stop_words = set(stopwords.words('english'))

In [17]:
# ALEX: remove ML code
# tf_idf = TfidfVectorizer(min_df=0.1, max_df=0.7)

In [18]:
def cleaned_string(string):
    # Removing all the digits
    string = re.sub(r'\d', '', string)
    
    # Removing accented data
    string = unicodedata.normalize('NFKD', string).encode('ascii', 'ignore').decode('utf-8', 'ignore')
    
    # Removing Mentions
    string = re.sub(r'@\w+', ' ', string)
    
    # Removing links 
    string = re.sub(r'(https?:\/\/)?([\da-zA-Z\.-\/\#\:]+)\.([\da-zA-Z\.\/\:\#]{0,9})([\/\w \.-\/\:\#]*)', ' ', string)
    
    # Removing all the digits special caharacters
    string = re.sub(r'\W', ' ', string)
        
    
    # Removing double whitespaces
    string = re.sub(r'\s+', ' ', string, flags=re.I)
    

    
    string = string.strip()
    
    #Removing all Single characters
    string = re.sub(r'\^[a-zA-Z]\s+','' , string)
    
    
    # Lemmetizing the string and removing stop words
    string = string.split()
# ALEX: remove ML code
#     string = [lemmetizer.lemmatize(word) for word in string if word not in stop_words and word not in all_names]
    string = ' '.join(string)
    
    # Lowercasing all data
    string = string.lower()
        
    return string

In [19]:
def clean_text(data):
    for i in range(data.shape[0]):
        for j in range(data.shape[1]):
            data.iloc[i, j] = cleaned_string(data.iloc[i, j])
    return data
            
            
    

In [22]:
tweet_cleaned_test_df = clean_text(tweet_test_df)

Unnamed: 0     int64
text          object
dtype: object

In [21]:
tweet_cleaned_test_df.shape

(3263, 1)

In [22]:
tweet_cleaned_test_df.head()

Unnamed: 0,text
0,just happened a terrible car crash
1,heard about earthquake is different cities sta...
2,there is a forest fire at spot pond geese are ...
3,apocalypse
4,typhoon soudelor kills in china and taiwan


In [23]:
tweet_cleaned_train_df = clean_text(tweet_train_df)

In [24]:
tweet_train_df.shape

(7613, 1)

In [25]:
tweet_cleaned_train_df.head()

Unnamed: 0,text
0,our deeds are the reason of this earthquake ma...
1,forest fire near la ronge
2,all residents asked to shelter in place are be...
3,people receive wildfires evacuation orders in ...
4,just got sent this photo from ruby alaska as s...


In [26]:
# ALEX: remove ML code
# X_train, X_valid, y_train, y_valid = train_test_split(tweet_cleaned_train_df['text'], target,random_state = 0)
_ = tweet_cleaned_train_df['text']


In [27]:
# ALEX: remove ML code
# catboost = LogisticRegression()

In [28]:
# ALEX: remove ML code
# pipeline_sgd = Pipeline([
#     ('tfidf',  TfidfVectorizer()),
#     ('nb', catboost,)
# ])

In [29]:
# ALEX: remove ML code
# model = pipeline_sgd.fit(X_train, y_train)

In [30]:
# ALEX: remove ML code
# y_predict = model.predict(X_valid)

In [31]:
# ALEX: remove ML code
# print(classification_report(y_valid, y_predict))

In [32]:
# ALEX: remove ML code
# y_pred_test = model.predict(tweet_cleaned_test_df['text'])
y_pred_test = tweet_cleaned_test_df['text']

In [33]:
# Saving result on test set
# ALEX: remove ML code
output = pd.DataFrame({'Id': Id,
                       'target': y_pred_test})

output.to_csv(r'submission.csv', index=False)