# Challenge 1

In [1]:
import nltk

In [2]:
from nltk.corpus import brown

In [3]:
brown.words()[0:10]

['The',
 'Fulton',
 'County',
 'Grand',
 'Jury',
 'said',
 'Friday',
 'an',
 'investigation',
 'of']

In [4]:
brown.tagged_words()

[('The', 'AT'), ('Fulton', 'NP-TL'), ...]

In [5]:
text = 'Ironhack is a Global Tech School ranked num 2 worldwide. Our mission is to help people transform their careers and join a thriving community of tech professionals that love what they do. This ideology is reflected in our teaching practices, which consist of a nine-weeks immersive programming, UX/UI design or Data Analytics course as well as a one-week hiring fair aimed at helping our students change their career and get a job straight after the course. We are present in 8 countries and have campuses in 9 locations - Madrid, Barcelona, Miami, Paris, Mexico City,  Berlin, Amsterdam, Sao Paulo and Lisbon.'

In [6]:
from nltk import sent_tokenize, word_tokenize

In [7]:
sent_tokenize(text)

['Ironhack is a Global Tech School ranked num 2 worldwide.',
 'Our mission is to help people transform their careers and join a thriving community of tech professionals that love what they do.',
 'This ideology is reflected in our teaching practices, which consist of a nine-weeks immersive programming, UX/UI design or Data Analytics course as well as a one-week hiring fair aimed at helping our students change their career and get a job straight after the course.',
 'We are present in 8 countries and have campuses in 9 locations - Madrid, Barcelona, Miami, Paris, Mexico City,  Berlin, Amsterdam, Sao Paulo and Lisbon.']

In [8]:
word_tokenize(text)

['Ironhack',
 'is',
 'a',
 'Global',
 'Tech',
 'School',
 'ranked',
 'num',
 '2',
 'worldwide',
 '.',
 'Our',
 'mission',
 'is',
 'to',
 'help',
 'people',
 'transform',
 'their',
 'careers',
 'and',
 'join',
 'a',
 'thriving',
 'community',
 'of',
 'tech',
 'professionals',
 'that',
 'love',
 'what',
 'they',
 'do',
 '.',
 'This',
 'ideology',
 'is',
 'reflected',
 'in',
 'our',
 'teaching',
 'practices',
 ',',
 'which',
 'consist',
 'of',
 'a',
 'nine-weeks',
 'immersive',
 'programming',
 ',',
 'UX/UI',
 'design',
 'or',
 'Data',
 'Analytics',
 'course',
 'as',
 'well',
 'as',
 'a',
 'one-week',
 'hiring',
 'fair',
 'aimed',
 'at',
 'helping',
 'our',
 'students',
 'change',
 'their',
 'career',
 'and',
 'get',
 'a',
 'job',
 'straight',
 'after',
 'the',
 'course',
 '.',
 'We',
 'are',
 'present',
 'in',
 '8',
 'countries',
 'and',
 'have',
 'campuses',
 'in',
 '9',
 'locations',
 '-',
 'Madrid',
 ',',
 'Barcelona',
 ',',
 'Miami',
 ',',
 'Paris',
 ',',
 'Mexico',
 'City',
 ',',
 '

# Challenge 2

In [9]:
import re

In [10]:
x = '''@Ironhack's-#Q website 776-is http://ironhack.com [(2018)]")'''

In [11]:
def clean_up(s):
    x = re.sub('\d','',re.sub('\W',' ',str(s.lower().split(' http:')[0])).strip())
    return x

In [12]:
x = clean_up(x)

In [13]:
x = word_tokenize(x)

In [14]:
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer

In [15]:
stem = PorterStemmer()
lemma = WordNetLemmatizer()

In [16]:
def stem_and_lemmatize(lst):
    x_stem_lemma = []
    for i in range(len(lst)):
        x_stem_lemma.append(lemma.lemmatize(stem.stem(lst[i])))
        
    return x_stem_lemma

In [17]:
x = stem_and_lemmatize(x)

In [18]:
from nltk.corpus import stopwords

In [19]:
def remove_stopwords(lst):
    x_nosw = []
    stopWords = set(stopwords.words('english'))
    for i in lst:
        if i not in stopWords:
            x_nosw.append(i)
    return x_nosw

In [20]:
remove_stopwords(x)

['ironhack', 'q', 'websit']

# Challenge 3

In [21]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer

In [22]:
txt = "Ironhack is a Global Tech School ranked num 2 worldwide. Our mission is to help people transform their careers and join a thriving community of tech professionals that love what they do."

In [23]:
analyzer = SentimentIntensityAnalyzer()

In [24]:
analyzer.polarity_scores(txt)

{'neg': 0.0, 'neu': 0.741, 'pos': 0.259, 'compound': 0.8442}

In [25]:
import pandas as pd
import zipfile

In [26]:
zf=zipfile.ZipFile('Sentiment140.csv.zip')
chunks=pd.read_csv(zf.open('Sentiment140.csv'), chunksize = 200000)
data = pd.concat(chunks)

In [27]:
data.head()

Unnamed: 0,target,id,date,flag,user,text
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."


In [28]:
data['text_processed'] = data.text.apply(lambda x : remove_stopwords(stem_and_lemmatize(word_tokenize(clean_up(x)))))

In [29]:
data.head()

Unnamed: 0,target,id,date,flag,user,text,text_processed
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t...",[switchfoot]
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...,"[upset, updat, hi, facebook, text, might, cri,..."
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...,"[kenichan, dive, mani, time, ball, manag, save..."
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire,"[whole, bodi, feel, itchi, like, fire]"
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all....","[nationwideclass, behav, mad, whi, becaus, see]"


In [30]:
from nltk.probability import ConditionalFreqDist
from nltk.probability import FreqDist
import nltk
from sklearn.model_selection import train_test_split

In [31]:
bag = []
for t in range(len((data.text_processed))):
    bag += data.text_processed[t]

In [32]:
wordfreq = {}
for token in bag:
    if token not in wordfreq.keys():
        wordfreq[token] = 1
    else:
        wordfreq[token] += 1

In [33]:
len(wordfreq)

545188

In [36]:
allwords = nltk.FreqDist(bag)

In [52]:
len(allwords)

545188

In [38]:
wordfreq5000 = {}
for k,v in allwords.items():
    if v > 5000:
        wordfreq5000[k] = v

In [53]:
wordfreq5000

{'updat': 9207,
 'hi': 24782,
 'text': 5716,
 'might': 9692,
 'cri': 8655,
 'school': 21050,
 'today': 68567,
 'also': 10348,
 'mani': 8911,
 'time': 66038,
 'rest': 5898,
 'go': 138211,
 'whole': 5976,
 'feel': 51128,
 'like': 82963,
 'whi': 28369,
 'becaus': 13570,
 'see': 50842,
 'need': 43211,
 'hug': 5205,
 'hey': 19025,
 'long': 17177,
 'ye': 18914,
 'rain': 17170,
 'bit': 13113,
 'onli': 27925,
 'lol': 59028,
 'thank': 58961,
 'break': 6225,
 'watch': 43513,
 'thought': 12659,
 'wa': 104501,
 'either': 5336,
 'never': 17857,
 'talk': 14882,
 'anymor': 6647,
 'would': 27135,
 'first': 16740,
 'realli': 49707,
 'though': 23834,
 'wish': 33737,
 'got': 70849,
 'miss': 56968,
 'hurt': 14119,
 'alway': 14853,
 'want': 56949,
 'love': 81237,
 'oh': 39644,
 'drink': 7654,
 'day': 108719,
 'get': 110389,
 'much': 36805,
 'done': 15311,
 'one': 56716,
 'friend': 25937,
 'call': 15560,
 'ask': 7167,
 'meet': 9437,
 'thi': 93303,
 'week': 28258,
 'hope': 44597,
 'class': 7620,
 'tomorrow':

In [55]:
word_token = list(wordfreq5000.keys())
word_token

['updat',
 'hi',
 'text',
 'might',
 'cri',
 'school',
 'today',
 'also',
 'mani',
 'time',
 'rest',
 'go',
 'whole',
 'feel',
 'like',
 'whi',
 'becaus',
 'see',
 'need',
 'hug',
 'hey',
 'long',
 'ye',
 'rain',
 'bit',
 'onli',
 'lol',
 'thank',
 'break',
 'watch',
 'thought',
 'wa',
 'either',
 'never',
 'talk',
 'anymor',
 'would',
 'first',
 'realli',
 'though',
 'wish',
 'got',
 'miss',
 'hurt',
 'alway',
 'want',
 'love',
 'oh',
 'drink',
 'day',
 'get',
 'much',
 'done',
 'one',
 'friend',
 'call',
 'ask',
 'meet',
 'thi',
 'week',
 'hope',
 'class',
 'tomorrow',
 'hate',
 'wake',
 'peopl',
 'sleep',
 'im',
 'sad',
 'ok',
 'almost',
 'everi',
 'make',
 'new',
 'may',
 'b',
 'morn',
 'work',
 'eye',
 'night',
 'sick',
 'hour',
 'sit',
 'caus',
 'back',
 'bed',
 'ill',
 'tell',
 'ya',
 'later',
 'good',
 'sorri',
 'came',
 'think',
 'even',
 'know',
 'kid',
 'anoth',
 'gon',
 'na',
 'studi',
 'exam',
 'ha',
 'enough',
 'heart',
 'wan',
 'still',
 'awww',
 'soo',
 'final',
 'fall'

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
tfidf=TfidfVectorizer(min_df=0.30, tokenizer=word_token)
tfidf

In [64]:
def features(col):
    features = {}
    x = set(col)
    for i in word_token:
        features[i] = (i in x)    
    z = analyzer.polarity_scores(" ".join(col))
    if z['pos'] > 0.5:
        s = True
    else:
        s = False
        
    return (features, s)

In [71]:
featmax = data.text_processed.apply(features)

In [72]:
featmax[100][1]

False

In [73]:
train_set = featmax[:800000]
test_set = featmax[800000:]

In [74]:
type(train_set)

pandas.core.series.Series

In [75]:
classifier = nltk.NaiveBayesClassifier.train(train_set)

In [76]:
nltk.classify.accuracy(classifier, test_set)

0.86354375