# TFIDF-ml

# 1、data extract

In [1]:
import json

In [2]:
def prepare_data(filename):
    """
    args: 
        data: {user_id:{"text":text,"user":user,"place_id":place_id}}
     
    return:
        result: {text: [text1,text2,...],length:[length1,length2,...],label:[rumor or not,...] } 
    """
    with open(filename, 'r') as obj:
        for line in obj.readlines():
            data = json.loads(line)
    result = {}
    text = []
    textlabel = []
    length = []
    for user_id in data.keys():
        text.append(data[user_id]['text'])
        textlabel.append(data[user_id]['place_id']) 
        length.append(len(data[user_id]['text']))
    result['text'] = text
    result['length'] = length
    result['label'] = textlabel
    return result

In [3]:
file = './train_dev_data/0905_1005.txt'
train_dev = prepare_data(file)

In [4]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [5]:
train_dev_df = pd.DataFrame(train_dev)
train_dev_df.head()

Unnamed: 0,text,length,label
0,Just posted a photo @ West End https://t.co/se...,54,4
1,"Just posted a photo @ Caulfield North, Victori...",2384,3
2,WATERCOLOUR ART CLASSES ONLINE Thursday 9 Sep...,280,1
3,Just posted a photo @ Centennial Parklands htt...,1641,1
4,Just posted a photo @ Richmond Hill Angus http...,264,11


In [6]:
#split the train_dev dataset
from sklearn.model_selection import train_test_split
x_processed = train_dev_df['text']
y_processed = train_dev_df['label']
x_train,x_dev,y_train,y_dev = train_test_split(x_processed,y_processed,test_size = 0.3,stratify = y_processed,random_state = 22)

In [7]:
print(len(x_train))
print(len(x_dev))
print(y_train.head())

2132
915
121     1
1473    0
1741    4
2189    5
2666    1
Name: label, dtype: int64


In [8]:
print(len(x_train.iloc[:].values))

2132


In [16]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import RidgeClassifier, LogisticRegression
from sklearn.metrics import f1_score
from sklearn import svm

In [10]:
# from sklearn.metrics import f1_score
#lr ngram 4,features 10000
from sklearn import svm

best_score = 0
best_clf = None
best_tfidf = None
for max_n in range(2,6):
    for features in range(10000,100000,10000):
        tfidf = TfidfVectorizer(ngram_range=(1, max_n), max_features=features).fit(x_train.iloc[:].values)
        train_tfidf = tfidf.transform(x_train.iloc[:].values)
        dev_tfidf = tfidf.transform(x_dev.iloc[:].values)
        clf = svm.SVC()
        clf.fit(train_tfidf, y_train.iloc[:].values)
        val_pred = clf.predict(dev_tfidf)
        score = f1_score(y_dev.iloc[:].values, val_pred, average='macro')
        print('max_n:',max_n,'features:',features,'score:%.4f'%score)
        if score > best_score:
            best_score = score
            best_clf = clf
            best_max_n = max_n
            best_features = features
print('best_n:',best_max_n,'best_features:',best_features,'best_score:%.4f'%best_score)


max_n: 2 features: 10000 score:0.3679
max_n: 2 features: 20000 score:0.3698
max_n: 2 features: 30000 score:0.3518
max_n: 2 features: 40000 score:0.3524
max_n: 2 features: 50000 score:0.3418
max_n: 2 features: 60000 score:0.3398
max_n: 2 features: 70000 score:0.3281
max_n: 2 features: 80000 score:0.3238
max_n: 2 features: 90000 score:0.3213
max_n: 3 features: 10000 score:0.3871
max_n: 3 features: 20000 score:0.3729
max_n: 3 features: 30000 score:0.3727
max_n: 3 features: 40000 score:0.3700
max_n: 3 features: 50000 score:0.3690
max_n: 3 features: 60000 score:0.3528
max_n: 3 features: 70000 score:0.3479
max_n: 3 features: 80000 score:0.3533
max_n: 3 features: 90000 score:0.3460
max_n: 4 features: 10000 score:0.3900
max_n: 4 features: 20000 score:0.3792
max_n: 4 features: 30000 score:0.3849
max_n: 4 features: 40000 score:0.3816
max_n: 4 features: 50000 score:0.3778
max_n: 4 features: 60000 score:0.3655
max_n: 4 features: 70000 score:0.3639
max_n: 4 features: 80000 score:0.3559
max_n: 4 fea

In [12]:
# from sklearn.metrics import f1_score
#lr ngram 4,features 10000
from sklearn import svm

best_score = 0
best_clf = None
best_tfidf = None
for max_n in range(5,6):
    for features in range(1000,20000,1000):
        tfidf = TfidfVectorizer(ngram_range=(1, max_n), max_features=features).fit(x_train.iloc[:].values)
        train_tfidf = tfidf.transform(x_train.iloc[:].values)
        dev_tfidf = tfidf.transform(x_dev.iloc[:].values)
        clf = svm.SVC()
        clf.fit(train_tfidf, y_train.iloc[:].values)
        val_pred = clf.predict(dev_tfidf)
        score = f1_score(y_dev.iloc[:].values, val_pred, average='macro')
        print('max_n:',max_n,'features:',features,'score:%.4f'%score)
        if score > best_score:
            best_score = score
            best_clf = clf
            best_max_n = max_n
            best_features = features
print('best_n:',best_max_n,'best_features:',best_features,'best_score:%.4f'%best_score)


max_n: 5 features: 1000 score:0.4057
max_n: 5 features: 2000 score:0.3764
max_n: 5 features: 3000 score:0.3831
max_n: 5 features: 4000 score:0.4003
max_n: 5 features: 5000 score:0.4122
max_n: 5 features: 6000 score:0.3819
max_n: 5 features: 7000 score:0.3868
max_n: 5 features: 8000 score:0.3915
max_n: 5 features: 9000 score:0.3855
max_n: 5 features: 10000 score:0.3937
max_n: 5 features: 11000 score:0.3946
max_n: 5 features: 12000 score:0.3960
max_n: 5 features: 13000 score:0.3871
max_n: 5 features: 14000 score:0.3785
max_n: 5 features: 15000 score:0.3771
max_n: 5 features: 16000 score:0.3793
max_n: 5 features: 17000 score:0.3776
max_n: 5 features: 18000 score:0.3810
max_n: 5 features: 19000 score:0.3808
best_n: 5 best_features: 5000 best_score:0.4122


In [13]:
print('best_n:',best_max_n,'best_features:',best_features,'best_score:%.4f'%best_score)

best_n: 5 best_features: 5000 best_score:0.4122


In [14]:
import joblib
# store the model
joblib.dump(best_clf, 'tfidf-svm_ngram{}_features{}_score{}.pkl'.format(best_max_n,best_features,best_score))

['tfidf-svm_ngram5_features5000_score0.41220183085122697.pkl']

In [17]:
best_max_n = 5
best_features = 5000

In [19]:
from sklearn import metrics
import numpy as np
tfidf = TfidfVectorizer(ngram_range=(1,best_max_n+1), max_features=best_features).fit(x_train.iloc[:].values)
train_tfidf = tfidf.transform(x_train.iloc[:].values)
dev_tfidf = tfidf.transform(x_dev.iloc[:].values)

clf = svm.SVC()
clf.fit(train_tfidf, y_train.iloc[:].values)

val_pred = clf.predict(dev_tfidf)
score = f1_score(y_dev.iloc[:].values, val_pred, average='macro')

print('best_score:%.4f'%score)
print(metrics.classification_report(y_dev.iloc[:].values,val_pred))

best_score:0.4087
              precision    recall  f1-score   support

          -1       1.00      0.10      0.18        20
           0       0.76      0.23      0.36        69
           1       0.40      0.88      0.55       214
           2       0.33      0.03      0.05        38
           3       0.60      0.64      0.62       187
           4       0.73      0.34      0.47        79
           5       0.68      0.49      0.57        95
           6       1.00      0.08      0.14        13
           7       0.66      0.37      0.48        51
           8       0.69      0.42      0.52        60
           9       1.00      0.14      0.24        22
          10       0.67      0.29      0.40         7
          11       0.80      0.36      0.50        11
          12       0.65      0.50      0.56        22
          13       1.00      0.50      0.67         6
          14       0.91      0.50      0.65        20
          15       0.00      0.00      0.00         1

    accu

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [20]:
#prepare for the test data
def prepare_test(filename):
    """
    args: 
        data: {user_id:{"text":text,"user":user,"place_id":place_id}}
     
    return:
        result: {text: [text1,text2,...],length:[length1,length2,...],label:[rumor or not,...] } 
    """
    with open(filename, 'r') as obj:
        for line in obj.readlines():
            data = json.loads(line)
    result = {}
    text = []
    location = []
    length = []
    user_ids = []
    for user_id in data.keys():
        user_ids.append(user_id)
        text.append(data[user_id]['text'])
        length.append(len(data[user_id]['text']))
        location.append(data[user_id]['location'])
    result['user_id'] = user_ids
    result['text'] = text
    result['length'] = length
    result['location'] = location
    return result

In [21]:
file = './test_data/0905.txt'
test = prepare_test(file)

In [22]:
test_df = pd.DataFrame(test)
test_df.head()

Unnamed: 0,user_id,text,length,location
0,276090111,@aVoice2bHrd You know how I am 😭I cut family o...,138,West Coast
1,1499908182,@AusAndy Most people probably have one now fro...,187,"Sydney, New South Wales"
2,1952211163,@anassilvvaa Bora@anassilvvaa Se envolver dinh...,1948,Invicta
3,788532504626012160,@60Mins Will this program be replayed?Is last ...,1284,"Newcastle, New South Wales"
4,1459159765,Breaking news: emergency COVID laws applying t...,133,Fortitude Valley


In [23]:
from sklearn import metrics
import numpy as np
tfidf = TfidfVectorizer(ngram_range=(1,best_max_n+1), max_features=best_features).fit(x_train.iloc[:].values)
# train_tfidf = tfidf.transform(x_train.iloc[:].values)
# dev_tfidf = tfidf.transform(x_dev.iloc[:].values)
test_tfidf = tfidf.transform(test_df['text'].iloc[:].values)

clf = svm.SVC()
clf.fit(train_tfidf, y_train.iloc[:].values)

val_pred = clf.predict(test_tfidf)
print(val_pred)

[1 3 5 ... 3 1 1]


In [24]:
#construct the test output file
test_df['predict_label'] = val_pred
test_df.head()

Unnamed: 0,user_id,text,length,location,predict_label
0,276090111,@aVoice2bHrd You know how I am 😭I cut family o...,138,West Coast,1
1,1499908182,@AusAndy Most people probably have one now fro...,187,"Sydney, New South Wales",3
2,1952211163,@anassilvvaa Bora@anassilvvaa Se envolver dinh...,1948,Invicta,5
3,788532504626012160,@60Mins Will this program be replayed?Is last ...,1284,"Newcastle, New South Wales",3
4,1459159765,Breaking news: emergency COVID laws applying t...,133,Fortitude Valley,1


In [35]:
## write predictions to json
# from collection import OrderedDict
def write2json(filename,dataframe):
    """
    args: 
        filename: the filename of the predicted data label file
        dataframe: the dataframe of the predicted data
    return:
        None
    """
    file_cnt = 0
    new_dict = {}
    with open(filename, 'w') as file:
        
        for index,row in dataframe.iterrows():
            new_dict['id'] = row['user_id']
            new_dict['place_id'] = row['predict_label']
        
            json_line = json.dumps(new_dict)
            file.write(json_line+'\n')
            file_cnt += 1
            if file_cnt % 100 == 0:
                print('file:'+str(file_cnt))
    return None

In [36]:
write2json('./ml_predict/svm/svm_0905_v1.json',test_df)

file:100
file:200
file:300
file:400
file:500
file:600
file:700
file:800
file:900
file:1000
file:1100
file:1200
file:1300
file:1400
file:1500
file:1600
file:1700
file:1800
file:1900
file:2000
file:2100
file:2200
file:2300
file:2400
file:2500
file:2600
file:2700
file:2800
file:2900
file:3000
file:3100
file:3200
file:3300
file:3400
file:3500
file:3600
file:3700
file:3800
file:3900
file:4000
file:4100
file:4200
file:4300
file:4400
file:4500
file:4600
file:4700
file:4800
file:4900
file:5000
file:5100
file:5200
file:5300
file:5400
file:5500
file:5600
file:5700
file:5800
file:5900
file:6000
file:6100
file:6200
file:6300
file:6400
file:6500
file:6600
file:6700
file:6800
file:6900
file:7000
file:7100
file:7200
file:7300
file:7400
file:7500
file:7600
file:7700
file:7800
file:7900
file:8000
file:8100
file:8200
file:8300
file:8400
file:8500
file:8600
file:8700
file:8800
file:8900
file:9000
file:9100
file:9200
file:9300
file:9400
file:9500
file:9600
file:9700
file:9800
file:9900
file:10000
file:101