### Libraries

In [1]:
# Varios
# ------------------------------------------------------------------------------
from collections import defaultdict
import os.path
import logging
from sinfo import sinfo
import re
import string
import math

# Tratamiento de textos
# ------------------------------------------------------------------------------
from scipy import sparse
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize

# Tratamiento de datos
# ------------------------------------------------------------------------------
import pandas as pd
import numpy as np

# Preprocesado y modelado
# ------------------------------------------------------------------------------

#----------- gensim ----------
import gensim
from gensim.models import KeyedVectors
from gensim.scripts.glove2word2vec import glove2word2vec
from gensim.utils import simple_preprocess
from gensim.models import Word2Vec
from gensim.sklearn_api import TfIdfTransformer
from gensim.test.utils import common_corpus, common_dictionary
from gensim import models, corpora, matutils

# ----------- sklearn ----------
from sklearn.linear_model import LogisticRegression # model
from sklearn import model_selection
import joblib
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_curve
from sklearn.metrics import roc_auc_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_recall_fscore_support as score
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

## Read Data and Explore

In [2]:
def read_data(filename, sep=',', encoding='utf-8'):
    """
    Load a delimited text file into a pandas DataFrame.

        filename: path (or file-like object) of the dataset to read
        sep: field separator used in the file (default ',')
        encoding: text encoding of the file (default 'utf-8', the value
                  that used to be hard-coded)

        return: DataFrame with the parsed contents, as produced by pd.read_csv
    """
    # Thin wrapper: every argument is forwarded unchanged to pandas.
    return pd.read_csv(filename, sep=sep, encoding=encoding)

In [17]:
# Directory holding the Kaggle CSV files. Set the DATA_DIR environment variable
# to point somewhere else; when it is unset the original author's local folder
# is used, so the cell behaves exactly as before.
DATA_DIR = os.environ.get(
    'DATA_DIR',
    '/Users/Cristian/OneDrive - Universidad Complutense de Madrid (UCM)/Portfolio_DS/Kaggle_challenges/NLP_Real_or_Not/Data',
)

# Training set loaded through the notebook's read_data helper.
df = read_data(os.path.join(DATA_DIR, 'train.csv'))

### Problem:

**What files do I need?**

You'll need train.csv, test.csv and sample_submission.csv.

__What should I expect the data format to be?__

Each sample in the train and test set has the following information:

- The text of a tweet
- A keyword from that tweet (although this may be blank!)
- The location the tweet was sent from (may also be blank)

__What am I predicting?__

You are predicting whether a given tweet is about a real disaster or not. If so, predict a 1. If not, predict a 0.

__Files__

- train.csv - the training set
- test.csv - the test set
- sample_submission.csv - a sample submission file in the correct format

__Columns__

id - a unique identifier for each tweet

text - the text of the tweet

location - the location the tweet was sent from (may be blank)

keyword - a particular keyword from the tweet (may be blank)

target - in train.csv only, this denotes whether a tweet is about a real disaster (1) or not (0)


In [18]:
## Field-by-field exploration.
# At first glance `id` looks like a plain row identifier that should carry no information.

# Preview of the first rows of the DataFrame
df.head(n=5)

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [20]:
# The ids run up to 10873 while the frame holds far fewer rows, so `id` is not a
# contiguous row counter. The problem statement describes the ids as unique; it is
# still worth checking that they carry no signal of their own. Hopefully not...
# this is meant to be an NLP task.
df['id']

# We have 7613 tweets.

0           1
1           4
2           5
3           6
4           7
        ...  
7608    10869
7609    10870
7610    10871
7611    10872
7612    10873
Name: id, Length: 7613, dtype: int64

In [None]:
# Python type of one raw value (row 1) from each of the two optional columns.
print(type(df.iloc[1]['location']), type(df.iloc[1]['keyword']))

# Odd that keyword comes back as a float. For location it could make sense if it
# were meant to hold geographic coordinates.
# NOTE(review): presumably row 1 is missing both fields and the float is NaN — confirm.
#
# A notebook cell only renders its LAST bare expression, so the first unique()
# call used to be silently discarded. `display` (provided by the IPython kernel)
# shows both results.
display(df.location.unique())  # -> should be strings, judging by the unique values
display(df.keyword.unique())   # -> should be strings

In [49]:
# Replace the missing (NaN) entries of the two optional columns with 0.
# The 0 acts as a "no value" marker: later cells cast these columns to str and
# filter on the resulting '0'.
for column in ('keyword', 'location'):
    df[column] = df[column].fillna(0)

In [50]:
# Preview the first rows again to check how keyword/location look after the fill.
df.head(n=5)

Unnamed: 0,id,keyword,location,text,target
0,1,0,0,Our Deeds are the Reason of this #earthquake M...,1
1,4,0,0,Forest fire near La Ronge Sask. Canada,1
2,5,0,0,All residents asked to 'shelter in place' are ...,1
3,6,0,0,"13,000 people receive #wildfires evacuation or...",1
4,7,0,0,Just got sent this photo from Ruby #Alaska as ...,1


In [68]:
# Turn keyword and location into strings, element by element
# (judging by their unique values, both should be strings).
for column in ('keyword', 'location'):
    df[column] = df[column].map(str)

In [110]:
# Keep only the rows where both keyword and location differ from the '0' marker.
has_keyword = df['keyword'].ne('0')
has_location = df['location'].ne('0')
keywords = df.loc[has_keyword & has_location]

# Same object under a second name, as in the original cell.
subset_1 = keywords
subset_1
# Idea: learn on this fully populated subset first, then use it to impute the
# rows whose values are empty ('0').

Unnamed: 0,id,keyword,location,text,target
31,48,ablaze,Birmingham,@bbcmtd Wholesale Markets ablaze http://t.co/l...,1
32,49,ablaze,Est. September 2012 - Bristol,We always try to bring the heavy. #metal #RT h...,0
33,50,ablaze,AFRICA,#AFRICANBAZE: Breaking news:Nigeria flag set a...,1
34,52,ablaze,"Philadelphia, PA",Crying out for more! Set me ablaze,0
35,53,ablaze,"London, UK",On plus side LOOK AT THE SKY LAST NIGHT IT WAS...,0
...,...,...,...,...,...
7575,10826,wrecked,TN,On the bright side I wrecked http://t.co/uEa0t...,0
7577,10829,wrecked,#NewcastleuponTyne #UK,@widda16 ... He's gone. You can relax. I thoug...,0
7579,10831,wrecked,"Vancouver, Canada",Three days off from work and they've pretty mu...,0
7580,10832,wrecked,London,#FX #forex #trading Cramer: Iger's 3 words tha...,0
