In [18]:
import numpy as np
import pandas as pd

# Read the labelled training tweets, one tweet per line. Lines are kept exactly
# as read (including their trailing newline).
with open('twitter-datasets/train_pos_full.txt', encoding='utf-8') as f:
    pos_tweets = f.readlines()

with open('twitter-datasets/train_neg_full.txt', encoding='utf-8') as f:
    neg_tweets = f.readlines()

In [19]:
# All tweets in file order: positives first, then negatives.
# dtype=object keeps references to the existing Python strings. Without it
# NumPy builds a fixed-width Unicode array in which every row is padded to the
# length of the longest tweet (4 bytes per character), which wastes memory on
# a large corpus.
all_tweets = np.array(pos_tweets + neg_tweets, dtype=object)

In [20]:
# Labels in file order: 1.0 for every positive tweet, then 0.0 for every
# negative tweet.
y = np.concatenate((np.ones(len(pos_tweets)), np.zeros(len(neg_tweets))))

# A single seeded permutation is applied to both tweets and labels, so they
# stay aligned and the train/validation split is the same on every run.
SPLIT_SEED = 0
random_idxs = np.random.RandomState(SPLIT_SEED).permutation(len(y))

# Rebuild the tweet array from the raw lists instead of reshuffling
# `all_tweets` in place. `y` is recreated in file order above, so shuffling an
# already-shuffled `all_tweets` on a re-run of this cell would silently
# misalign tweets and labels. dtype=object avoids a padded fixed-width copy.
all_tweets = np.array(pos_tweets + neg_tweets, dtype=object)[random_idxs]
y = y[random_idxs]

# First 80% of the shuffled data is used for training, the rest for validation.
N_train = int(0.8*len(y))

train, val = all_tweets[:N_train], all_tweets[N_train:]
y_train, y_val = y[:N_train], y[N_train:]

In [21]:
from sklearn.feature_extraction.text import TfidfVectorizer 
 
# TF-IDF features over unigrams and bigrams. The vectorizer is fitted on the
# training split only; the validation split is transformed with that fit.
tfidf_vectorizer = TfidfVectorizer(
    ngram_range=(1, 2),
    use_idf=True,
)
X_train, X_val = tfidf_vectorizer.fit_transform(train), tfidf_vectorizer.transform(val)

In [22]:
# Inspect the TF-IDF weights of the first training tweet, largest weight first.
first_vector_tfidfvectorizer = X_train[0]

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2.
# Use its replacement when the installed version has it, otherwise fall back
# to the old name so the cell runs on either side of that change.
feature_names_getter = (
    getattr(tfidf_vectorizer, "get_feature_names_out", None)
    or tfidf_vectorizer.get_feature_names
)

df = pd.DataFrame(
    first_vector_tfidfvectorizer.T.todense(),
    index=feature_names_getter(),
    columns=["tfidf"],
)
df.sort_values(by=["tfidf"], ascending=False)


Unnamed: 0,tfidf
emojis too,0.384175
too notfair,0.373661
use emojis,0.351690
iphone so,0.297310
notfair,0.281223
...,...
from bungaterakhir,0.000000
from bung,0.000000
from bundaberg,0.000000
from bunch,0.000000


In [23]:
from sklearn import linear_model

# Logistic-regression baseline trained with SGD on the TF-IDF features.
# random_state is pinned, as in the other classifier cells: SGD shuffles the
# training samples between epochs, so an unseeded run reports slightly
# different accuracies on every execution.
# NOTE(review): scikit-learn 1.1 renamed this loss to 'log_loss' and 1.3
# removed the 'log' spelling; update the string when upgrading.
clf = linear_model.SGDClassifier(loss = 'log', max_iter=int(1e7), tol=1e-5, verbose = False, random_state=0)

clf.fit(X_train, y_train)

# Accuracy = fraction of predictions equal to the true label.
train_acc = (clf.predict(X_train) == y_train).mean()
val_acc = (clf.predict(X_val) == y_val).mean()
print('Training set accuracy: {:.2f}% / validation set: {:.2f}%'.format(100*train_acc, 100*val_acc))

Training set accuracy: 76.38% / validation set: 76.57%


In [24]:
from sklearn.svm import LinearSVC

# Linear SVM (squared hinge loss, dual formulation) on the sparse TF-IDF features.
clf = LinearSVC(
    C=0.03,
    loss='squared_hinge',
    dual=True,
    tol=1e-9,
    random_state=0,
).fit(X_train, y_train)

# Accuracy = mean of the element-wise "prediction equals label" mask.
train_acc = np.mean(clf.predict(X_train) == y_train)
val_acc = np.mean(clf.predict(X_val) == y_val)
print('Training set accuracy: {:.2f}% / validation set: {:.2f}%'.format(train_acc * 100, val_acc * 100))

Training set accuracy: 86.12% / validation set: 84.30%


In [None]:
from sklearn.decomposition import TruncatedSVD

# Project the TF-IDF features onto 1000 SVD components. The projection is
# fitted on the training split and then applied to the validation split.
svd = TruncatedSVD(
    n_components=1000,
    n_iter=7,
    random_state=42,
)
X_train_, X_val_ = svd.fit_transform(X_train), svd.transform(X_val)

# Linear SVM on the reduced features (primal formulation, dual=False).
clf = LinearSVC(
    C=0.1,
    loss='squared_hinge',
    dual=False,
    tol=1e-9,
    random_state=0,
).fit(X_train_, y_train)

train_acc = np.mean(clf.predict(X_train_) == y_train)
val_acc = np.mean(clf.predict(X_val_) == y_val)
print('Training set accuracy: {:.2f}% / validation set: {:.2f}%'.format(train_acc * 100, val_acc * 100))

In [None]:
from sklearn.ensemble import RandomForestClassifier
# Random forest of 100 trees fitted on the TF-IDF training features.
clf = RandomForestClassifier(
    random_state=0,
    n_estimators=100,
).fit(X_train, y_train)

# Predicted labels for the validation split.
predictions = clf.predict(X_val)

In [None]:
# Accuracy of the current `clf` on both splits: the mean of the element-wise
# "prediction equals label" mask.
train_acc = np.mean(clf.predict(X_train) == y_train)
val_acc = np.mean(clf.predict(X_val) == y_val)
print('Training set accuracy: {:.2f}% / validation set: {:.2f}%'.format(train_acc * 100, val_acc * 100))

### Predictions

In [29]:
# Read the test tweets. Everything up to and including the first comma of each
# line is dropped (the original code treated that prefix as the tweet's index);
# the remainder is kept verbatim, so commas inside the tweet are untouched.
# A line with no comma yields an empty string.
with open('twitter-datasets/test_data.txt', encoding='utf-8') as f:
    test_tweets = [line.partition(',')[2] for line in f]

# Refit the vectorizer on all labelled tweets (train + validation) and
# transform the test tweets with that same fit.
# NOTE: this rebinds `tfidf_vectorizer`, replacing the one fitted on the
# training split only.
tfidf_vectorizer = TfidfVectorizer(use_idf=True, ngram_range=(1, 2))
X_train_final = tfidf_vectorizer.fit_transform(all_tweets)
X_test = tfidf_vectorizer.transform(test_tweets)

# Final model: same LinearSVC settings as the validated cell, trained on all
# labelled data.
clf = LinearSVC(random_state=0, tol=1e-9, loss='squared_hinge', dual=True, C=0.03)
clf.fit(X_train_final, y)

# Only a training accuracy can be reported here: no held-out data remains.
train_acc = (clf.predict(X_train_final) == y).mean()
print('Training set accuracy: {:.2f}%'.format(100*train_acc))

Training set accuracy: 86.27%


In [30]:
# Predict labels for the test features with the current `clf` (the LinearSVC
# fitted on all labelled tweets when the cells are run in order), then display
# the resulting array.
predictions = clf.predict(X_test)
predictions

array([0., 1., 0., ..., 0., 1., 0.])

In [45]:
# Build the submission table: map the {0, 1} predictions to {-1, +1} and number
# the rows 1..N under an index named 'Id'.
df = pd.DataFrame(
    {'Prediction': (2*predictions - 1).astype(int)},
    index=pd.Index(np.arange(1, len(predictions) + 1), name='Id'),
)
df


Unnamed: 0_level_0,Prediction
Id,Unnamed: 1_level_1
1,-1
2,1
3,-1
4,1
5,-1
...,...
9996,1
9997,-1
9998,-1
9999,1


In [46]:
# Write the submission file to the working directory. The DataFrame index is
# written as the first column (to_csv's default), followed by the data column(s).
df.to_csv('submission_tfidf.csv')