In [1]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import LabelEncoder
import re
from nltk import TweetTokenizer
from scipy.sparse import hstack

In [2]:
# Load the training tweets from the CSV that sits next to the notebook.
data = pd.read_csv(filepath_or_buffer='Tweets_concated_row.csv')

In [3]:
# Preview the first five rows to check the columns that were read.
data.head(n=5)

Unnamed: 0,gender,tweet
0,male,one to watch … \r\r\navailable on 10th feb. ht...
1,female,are we living in a holographic universe? new s...
2,female,"museum focus, but still great pieces of advice..."
3,female,best half time show ever! <eot> @jannarden no...
4,male,does this mean @waitakereunited are top of the...


In [5]:
# Show the full text of the tweet entry labelled 0.
data['tweet'][0]



In [4]:
# Replace @mentions with '<user>' and URLs (plus any trailing whitespace) with '<url>'.
# Both patterns are raw strings: in a plain literal, '\S' and '\s' are invalid
# escape sequences, which newer Python versions warn about.
data['tweet'] = data.tweet.apply(lambda x: re.sub(r"\B@\w+", "<user>", x))
data['tweet'] = data.tweet.apply(lambda x: re.sub(r"http\S+\s*", "<url>", x))

In [5]:
# Labels (gender) and documents (tweet text), each taken as an independent copy.
y = data['gender'].copy()
X = data['tweet'].copy()

## Trying Char N-Grams

In [15]:
# TF-IDF over character 2- to 6-grams, vocabulary capped at 15,000 features.
tfidf_vectorizer_Char2_6 = TfidfVectorizer(
    analyzer='char',
    ngram_range=(2, 6),
    max_features=15000,
    lowercase=True,
    strip_accents='unicode',
    sublinear_tf=True,
)

In [16]:
# Learn the character n-gram vocabulary on all tweets and vectorise them in one pass.
X_char_2_6 = tfidf_vectorizer_Char2_6.fit_transform(raw_documents=X)

In [18]:
# Cross-validate a default logistic regression on the char 2-6 TF-IDF features.
# NOTE(review): the fold count and solver come from library defaults - consider pinning them.
cv_results = cross_val_score(estimator=LogisticRegression(), X=X_char_2_6, y=y)



In [19]:
# Report mean and standard deviation of the per-fold scores.
print('Char 2-6 CV Mean: %f std: %f' % (cv_results.mean(), cv_results.std()))

Char 2-6 CV Mean: 0.793333 std: 0.015861


In [20]:
# Character 2-6-gram TF-IDF again, this time with the vocabulary capped at 10,000.
tfidf_vectorizer_Char2_6 = TfidfVectorizer(
    analyzer='char',
    ngram_range=(2, 6),
    max_features=10000,
    lowercase=True,
    strip_accents='unicode',
    sublinear_tf=True,
)
X_char_2_6 = tfidf_vectorizer_Char2_6.fit_transform(X)

# Score a default logistic regression on these features and report the fold statistics.
cv_results = cross_val_score(LogisticRegression(), X_char_2_6, y)
print('Char 2-6 CV Mean: %f std: %f' % (cv_results.mean(), cv_results.std()))



Char 2-6 CV Mean: 0.789000 std: 0.014765


In [21]:
# Character TF-IDF widened to 1- to 6-grams, vocabulary capped at 15,000.
tfidf_vectorizer_Char1_6 = TfidfVectorizer(
    analyzer='char',
    ngram_range=(1, 6),
    max_features=15000,
    lowercase=True,
    strip_accents='unicode',
    sublinear_tf=True,
)
X_char_1_6 = tfidf_vectorizer_Char1_6.fit_transform(X)

# Score a default logistic regression on these features and report the fold statistics.
cv_results = cross_val_score(LogisticRegression(), X_char_1_6, y)
print('Char 1-6 CV Mean: %f std: %f' % (cv_results.mean(), cv_results.std()))



Char 1-6 CV Mean: 0.793333 std: 0.015628


In [33]:
# Character TF-IDF narrowed to 3- to 5-grams, vocabulary capped at 15,000.
tfidf_vectorizer_Char3_5 = TfidfVectorizer(
    analyzer='char',
    ngram_range=(3, 5),
    max_features=15000,
    lowercase=True,
    strip_accents='unicode',
    sublinear_tf=True,
)
X_char_3_5 = tfidf_vectorizer_Char3_5.fit_transform(X)

# Score a default logistic regression on these features and report the fold statistics.
cv_results = cross_val_score(LogisticRegression(), X_char_3_5, y)
print('Char 3-5 CV Mean: %f std: %f' % (cv_results.mean(), cv_results.std()))



Char 3-5 CV Mean: 0.799333 std: 0.013888


In [27]:
# Raw counts over WORD 2- to 6-grams (analyzer='word'), vocabulary capped at 15,000.
# The matrix name and the printed label say "word", because these are not
# character n-grams.
ctv = CountVectorizer(analyzer='word',
                    token_pattern=r'\w{1,}',
                    ngram_range=(2, 6),
                    max_features=15000)
X_word_2_6_counts = ctv.fit_transform(X)
cv_results = cross_val_score(LogisticRegression(), X_word_2_6_counts, y)
print('Word 2-6 counts CV Mean: %f std: %f'%(np.mean(cv_results), np.std(cv_results)))



Char 1-6 CV Mean: 0.749667 std: 0.004714


In [28]:
# Raw counts over WORD 2- to 6-grams (analyzer='word'), vocabulary capped at 10,000.
# The matrix name and the printed label say "word", because these are not
# character n-grams.
ctv = CountVectorizer(analyzer='word',
                    token_pattern=r'\w{1,}',
                    ngram_range=(2, 6),
                    max_features=10000)
X_word_2_6_counts = ctv.fit_transform(X)
cv_results = cross_val_score(LogisticRegression(), X_word_2_6_counts, y)
print('Word 2-6 counts CV Mean: %f std: %f'%(np.mean(cv_results), np.std(cv_results)))



Char 2-6 CV Mean: 0.741333 std: 0.007318


## Trying word N-grams

In [16]:
# NLTK tweet tokenizer with its default options spelled out.
tweetTokenizer = TweetTokenizer(preserve_case=True, reduce_len=False, strip_handles=False)

In [29]:
# TF-IDF over word 1- to 3-grams, tokenised with the tweet tokenizer, capped at 10,000 features.
tfidf_vectorizer = TfidfVectorizer(
    analyzer='word',
    tokenizer=tweetTokenizer.tokenize,
    ngram_range=(1, 3),
    max_features=10000,
    strip_accents='unicode',
    sublinear_tf=True,
)
X_vect = tfidf_vectorizer.fit_transform(X)

# Score a default logistic regression on these features and report the fold statistics.
cv_results = cross_val_score(LogisticRegression(), X_vect, y)
print('Word 1-3 Grams CV Mean: %f std: %f' % (cv_results.mean(), cv_results.std()))



Word 1-3 Grams CV Mean: 0.805333 std: 0.021792


In [30]:
# Raw-count baseline over word 1- to 3-grams (no TF-IDF weighting), capped at 10,000 features.
# The vectorizer has its own name so that `tfidf_vectorizer` is not rebound to
# a non-TF-IDF model.
count_vectorizer_words = CountVectorizer(
                strip_accents='unicode',
                analyzer='word',
                tokenizer=tweetTokenizer.tokenize,
                ngram_range=(1, 3),
                max_features=10000)
X_vect = count_vectorizer_words.fit_transform(X)
cv_results = cross_val_score(LogisticRegression(), X_vect, y)
print('Word 1-3 Grams CV Mean: %f std: %f'%(np.mean(cv_results), np.std(cv_results)))



Word 1-3 Grams CV Mean: 0.777333 std: 0.014197


# Char N-grams + Word N-grams

In [32]:
# Word-level view: TF-IDF over word 1- to 3-grams (tweet tokenizer), capped at 10,000 features.
tfidf_vectorizer = TfidfVectorizer(
    analyzer='word',
    tokenizer=tweetTokenizer.tokenize,
    ngram_range=(1, 3),
    max_features=10000,
    strip_accents='unicode',
    sublinear_tf=True,
)
X_vect = tfidf_vectorizer.fit_transform(X)

# Character-level view: TF-IDF over character 2- to 6-grams, capped at 10,000 features.
tfidf_vectorizer_Char2_6 = TfidfVectorizer(
    analyzer='char',
    ngram_range=(2, 6),
    max_features=10000,
    lowercase=True,
    strip_accents='unicode',
    sublinear_tf=True,
)
X_char_2_6 = tfidf_vectorizer_Char2_6.fit_transform(X)

# Place both sparse matrices side by side (scipy.sparse.hstack) and cross-validate on the union.
cv_results = cross_val_score(LogisticRegression(), hstack((X_vect, X_char_2_6)), y)
print('Word 1-3-Grams + Char 2-6-Grams CV Mean: %f std: %f' % (cv_results.mean(), cv_results.std()))



Word 1-3-Grams + Char 2-6-Grams CV Mean: 0.814667 std: 0.016938


# Using LSA for dimensionality reduction

In [8]:
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import Normalizer

## Word N-grams

In [17]:
# Word 1-3-gram TF-IDF (tweet tokenizer), capped at 15,000 features; this is the SVD input.
tfidf_vectorizer = TfidfVectorizer(
    analyzer='word',
    tokenizer=tweetTokenizer.tokenize,
    ngram_range=(1, 3),
    max_features=15000,
    strip_accents='unicode',
    sublinear_tf=True,
)
X_vect = tfidf_vectorizer.fit_transform(X)


In [18]:
# Fit a 300-component truncated SVD on the word TF-IDF matrix; the seed is fixed.
svd_words = TruncatedSVD(
    n_components=300,
    n_iter=7,
    random_state=42,
)
svd_words.fit(X=X_vect)

TruncatedSVD(algorithm='randomized', n_components=300, n_iter=7,
       random_state=42, tol=0.0)

In [19]:
# Project the word TF-IDF matrix onto the fitted SVD components.
X_vect_svd = svd_words.transform(X=X_vect)


In [20]:
# Cross-validate a default logistic regression on the SVD-reduced word features.
cv_results = cross_val_score(estimator=LogisticRegression(), X=X_vect_svd, y=y)
print('Word 1-3-Grams with SVD (comp=300): %f std: %f' % (cv_results.mean(), cv_results.std()))

Word 1-3-Grams with SVD (comp=300): 0.807000 std: 0.021463




In [21]:
# Rescale each row of the reduced word features to unit L2 norm, then cross-validate again.
X_vect_svd_norm = Normalizer(norm='l2').fit_transform(X_vect_svd)
cv_results = cross_val_score(estimator=LogisticRegression(), X=X_vect_svd_norm, y=y)
print('Word 1-3-Grams with SVD (comp=300), normalized: %f std: %f' % (cv_results.mean(), cv_results.std()))



Word 1-3-Grams with SVD (comp=300), normalized: 0.812667 std: 0.017327


## Char N-grams

In [6]:
# Character 2-6-gram TF-IDF, capped at 15,000 features; this is the char-level SVD input.
tfidf_vectorizer_Char2_6 = TfidfVectorizer(
    analyzer='char',
    ngram_range=(2, 6),
    max_features=15000,
    lowercase=True,
    strip_accents='unicode',
    sublinear_tf=True,
)
X_char_2_6 = tfidf_vectorizer_Char2_6.fit_transform(X)

In [9]:
# Fit a 300-component truncated SVD on the char TF-IDF matrix, then project that matrix.
svd_chars = TruncatedSVD(
    n_components=300,
    n_iter=7,
    random_state=42,
)
svd_chars.fit(X=X_char_2_6)
X_vect_svd_char = svd_chars.transform(X=X_char_2_6)

In [10]:
# Cross-validate a default logistic regression on the SVD-reduced char features.
cv_results = cross_val_score(estimator=LogisticRegression(), X=X_vect_svd_char, y=y)
print('Char 2-6-Grams with SVD (comp=300): %f std: %f' % (cv_results.mean(), cv_results.std()))

Char 2-6-Grams with SVD (comp=300): 0.800000 std: 0.018493




In [14]:
# Rescale each row of the reduced char features to unit L2 norm, then cross-validate again.
X_vect_svd_char_norm = Normalizer(norm='l2').fit_transform(X_vect_svd_char)
cv_results = cross_val_score(estimator=LogisticRegression(), X=X_vect_svd_char_norm, y=y)
print('Char 2-6-Grams with SVD (comp=300), normalized: %f std: %f' % (cv_results.mean(), cv_results.std()))



Char 2-6-Grams with SVD (comp=300), normalized: 0.806000 std: 0.017907


In [28]:
# Sanity check: dimensions of the normalised char SVD features.
np.shape(X_vect_svd_char_norm)

(3000, 300)

In [31]:
# Sanity check: container type of the normalised word SVD features.
X_vect_svd_norm.__class__

numpy.ndarray

In [29]:
# Sanity check: dimensions of the normalised word SVD features.
np.shape(X_vect_svd_norm)

(3000, 300)

In [33]:
# Sanity check: stacking the two feature blocks column-wise keeps the rows and adds the columns.
np.concatenate([X_vect_svd_norm, X_vect_svd_char_norm], axis=1).shape

(3000, 600)

## Word SVD + Char SVD

In [34]:
# Cross-validate on the column-wise concatenation of the normalised word and char SVD features.
# The label names both feature sets, because the score is for the combined matrix.
cv_results = cross_val_score(LogisticRegression(),
                             np.hstack([X_vect_svd_norm, X_vect_svd_char_norm]), y)
print('Word 1-3-Grams + Char 2-6-Grams with SVD (comp=300 each), normalized: %f std: %f'%(np.mean(cv_results), np.std(cv_results)))



Char 2-6-Grams with SVD (comp=300), normalized: 0.821333 std: 0.014817




# Check the accuracy on the test data

In [36]:
# Load the held-out tweets and apply the same masking as for the training data:
# @mentions become '<user>', URLs (plus any trailing whitespace) become '<url>'.
# Both patterns are raw strings: in a plain literal, '\S' and '\s' are invalid
# escape sequences, which newer Python versions warn about.
test_data = pd.read_csv('test_tweets_row.csv')
test_data['tweet'] = test_data.tweet.apply(lambda x: re.sub(r"\B@\w+", "<user>", x))
test_data['tweet'] = test_data.tweet.apply(lambda x: re.sub(r"http\S+\s*", "<url>", x))

In [37]:
# Held-out labels and documents, taken from the masked test frame.
y_test = test_data['gender']
X_test = test_data['tweet']

In [38]:
# Word pipeline for the test tweets: TF-IDF -> SVD -> row normalisation, using objects
# fitted on the training data only.
# NOTE(review): `tfidf_vectorizer` is rebound in several cells; this relies on it being
# the vectorizer that produced the matrix `svd_words` was fitted on - confirm on a clean run.
svd_word_normalizer = Normalizer()
svd_word_normalizer.fit(X_vect_svd)

test_word_vect = tfidf_vectorizer.transform(X_test)
test_word_svd = svd_words.transform(test_word_vect)
test_word_svd_norm = svd_word_normalizer.transform(test_word_svd)

In [44]:
# Char pipeline for the test tweets: TF-IDF -> SVD -> row normalisation, using objects
# fitted on the training data only.
svd_char_normalizer = Normalizer()
svd_char_normalizer.fit(X_vect_svd_char)

test_char_vect = tfidf_vectorizer_Char2_6.transform(X_test)
test_char_svd = svd_chars.transform(test_char_vect)
test_char_svd_norm = svd_char_normalizer.transform(test_char_svd)

In [39]:
# Final model: default logistic regression fitted on the combined word + char SVD
# features of the full training set.
lr = LogisticRegression()
lr.fit(np.concatenate([X_vect_svd_norm, X_vect_svd_char_norm], axis=1), y)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)

In [46]:
# Accuracy on the data the model was fitted on (an optimistic figure, for reference).
lr.score(np.concatenate([X_vect_svd_norm, X_vect_svd_char_norm], axis=1), y)

0.87

In [45]:
# Accuracy on the held-out test tweets, with features stacked in the same word-then-char order.
lr.score(np.concatenate([test_word_svd_norm, test_char_svd_norm], axis=1), y_test)

0.8221052631578948