## 逻辑回归完成二分类，利用sklearn库完成tokenize

In [1]:
import sklearn
import pandas as pd
import re

# Load the labelled training tweets from the CSV next to the notebook.
df = pd.read_csv('train.csv')
# Preview the first rows of the raw data.
df.head()

Unnamed: 0,id,label,tweet
0,1,0,@ when a father is dysfunctional and is so se...
1,2,0,@ @ thanks for #lyft credit i can't use cause ...
2,3,0,bihday your majesty
3,4,0,#model i love u take with u all the time in ...
4,5,0,factsguide: society now #motivation


### 数据清洗

In [2]:
# Clean the tweets: strip every character listed in string.punctuation,
# then pass what is left through tweet-preprocessor's p.clean().
from string import punctuation
import preprocessor as p

# Translation table that deletes all characters in string.punctuation.
punct_table = str.maketrans('', '', punctuation)

# Assign the whole column in one step. The former row-by-row
# `df['tweet'][i] = ...` was chained assignment: pandas emits
# SettingWithCopyWarning for it and, with Copy-on-Write enabled, the write
# never reaches `df`, so the tweets would silently stay uncleaned.
df['tweet'] = df['tweet'].map(lambda text: p.clean(text.translate(punct_table)))

In [3]:
# Preview the tweets after cleaning.
df.head()

Unnamed: 0,id,label,tweet
0,1,0,when a father is dysfunctional and is so selfi...
1,2,0,thanks for lyft credit i cant use cause they d...
2,3,0,bihday your majesty
3,4,0,model i love u take with u all the time in ur
4,5,0,factsguide society now motivation


In [7]:
# Split the dataset: hold out 30% of the rows as the test set.
# shuffle=False keeps the rows in file order instead of shuffling them.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    df['tweet'], df['label'], test_size=0.3, shuffle=False)

In [8]:
# Tokenize: turn each tweet into a vector of token counts.
from sklearn.feature_extraction.text import CountVectorizer
# The pattern \b\w+\b accepts any run of word characters, including
# single-character tokens.
vectorizer = CountVectorizer(token_pattern=r'\b\w+\b')
# The vocabulary is fitted on the training tweets only; the test tweets are
# merely transformed with that vocabulary.
# NOTE(review): X_train / X_test are overwritten with the vectorised
# matrices here, so re-running this cell on its own would feed matrices
# (not text) back into the vectoriser -- re-run the split cell first.
X_train = vectorizer.fit_transform(X_train)
X_test = vectorizer.transform(X_test)


In [9]:
# Fit a logistic-regression classifier on the vectorised training tweets,
# then predict labels for the held-out test tweets.
from sklearn.linear_model import LogisticRegression

# fit() returns the estimator itself, so construction and fitting can be chained.
lr = LogisticRegression().fit(X_train, y_train)
predicted = lr.predict(X_test)

In [10]:
# Report precision / recall / F1 of the predictions against the test labels.
from sklearn.metrics import classification_report

report = classification_report(y_test, predicted)
print("Classification report for classifier %s:\n%s\n" % (lr, report))

Classification report for classifier LogisticRegression():
              precision    recall  f1-score   support

           0       0.96      0.99      0.98      8908
           1       0.85      0.47      0.61       681

    accuracy                           0.96      9589
   macro avg       0.91      0.73      0.79      9589
weighted avg       0.95      0.96      0.95      9589




In [11]:
def getLrModel(path='trainNew.csv'):
    """Train a bag-of-words logistic-regression classifier on a labelled tweet CSV.

    The CSV is read with pandas and must provide a ``tweet`` text column and
    a ``label`` column. Each tweet has the characters in
    ``string.punctuation`` removed and is then passed through
    tweet-preprocessor's ``p.clean``. The rows are split 70/30 without
    shuffling and only the training part is used to fit the vectoriser and
    the classifier.

    Relies on names imported by earlier notebook cells: ``pd``,
    ``punctuation``, ``p``, ``train_test_split``, ``CountVectorizer`` and
    ``LogisticRegression``.

    Parameters
    ----------
    path : str, default 'trainNew.csv'
        Location of the labelled training CSV.

    Returns
    -------
    list
        ``[vectorizer, lr]`` -- the fitted ``CountVectorizer`` and the fitted
        ``LogisticRegression``, in that order.
    """
    df = pd.read_csv(path)

    # Clean the whole column in one assignment. The former per-row
    # `df['tweet'][i] = ...` was chained assignment, which pandas warns about
    # and which does not modify `df` at all under Copy-on-Write.
    punct_table = str.maketrans('', '', punctuation)
    df['tweet'] = df['tweet'].map(lambda text: p.clean(text.translate(punct_table)))

    # Same split as before so the fitted model is unchanged; the held-out
    # part is not needed here and is therefore discarded.
    X_train, _, y_train, _ = train_test_split(
        df['tweet'], df['label'], test_size=0.3, shuffle=False)

    vectorizer = CountVectorizer(token_pattern=r'\b\w+\b')
    lr = LogisticRegression()
    lr.fit(vectorizer.fit_transform(X_train), y_train)

    return [vectorizer, lr]

## 对于爬虫数据集进行处理

In [12]:
# Load the scraped tweets and preview the first rows.
nyc = pd.read_csv('NewYork.csv')
nyc.head()

Unnamed: 0,id,conversation_id,created_at,date,time,timezone,user_id,username,name,place,...,geo,source,user_rt_id,user_rt,retweet_id,reply_to,retweet_date,translate,trans_src,trans_dest
0,1347332013104246785,1347332013104246785,2021-01-08 07:59:18 CST,2021-01-08,07:59:18,800,40357519,tjsalon,Timothy John's Salon,"{'type': 'Point', 'coordinates': [40.76459, -7...",...,"40.75773,-73.9857,1km",,,,,[],,,,
1,1347330120529412100,1347330120529412100,2021-01-08 07:51:47 CST,2021-01-08,07:51:47,800,1030110827154743297,glenn_biz,Kurt Glenn,,...,"40.75773,-73.9857,1km",,,,,[],,,,
2,1347326991691894785,1347326991691894785,2021-01-08 07:39:21 CST,2021-01-08,07:39:21,800,487187356,evauseb,"cardi b, jt, and meg areola stan account.",,...,"40.75773,-73.9857,1km",,,,,[],,,,
3,1347326056160763904,1347326056160763904,2021-01-08 07:35:38 CST,2021-01-08,07:35:38,800,50706690,511ny,511 New York,"{'type': 'Point', 'coordinates': [40.752998, -...",...,"40.75773,-73.9857,1km",,,,,[],,,,
4,1347324286902689792,1347324286902689792,2021-01-08 07:28:36 CST,2021-01-08,07:28:36,800,50706690,511ny,511 New York,"{'type': 'Point', 'coordinates': [40.762456, -...",...,"40.75773,-73.9857,1km",,,,,[],,,,


In [13]:
# Clean the scraped tweets the same way as the training data: strip the
# characters in string.punctuation, then run tweet-preprocessor's p.clean().
from string import punctuation
import preprocessor as p

# Translation table that deletes all characters in string.punctuation.
nyc_punct_table = str.maketrans('', '', punctuation)

# One column-level assignment instead of the former per-row
# `nyc['tweet'][i] = ...`, which was chained assignment (SettingWithCopyWarning;
# a silent no-op under pandas Copy-on-Write).
nyc['tweet'] = nyc['tweet'].map(lambda text: p.clean(text.translate(nyc_punct_table)))

# Train the classifier and keep the fitted vectoriser alongside it.
vec,lrModel = getLrModel()


In [15]:
# Vectorise the cleaned scraped tweets with the vectoriser returned by
# getLrModel() and predict a label for each of them.
tweet = vec.transform(nyc['tweet'])
predict = lrModel.predict(tweet)

In [18]:
# Print every scraped tweet that the classifier assigned to class 1,
# each followed by an empty line.
for text, flag in zip(nyc['tweet'], predict):
    if flag == 1:
        print(text)
        print()

Looks like a scene from the movie Scarface but no its our Capitol of the UnitedStates this is Maga not blacklivesmatter Capitol at Washington DC httpstcoCiGnywsax7

IBM is looking for teammates like you See our latest Marketing job openings including Client Solution Executive Business Process Outsourcing BPO via the link in our bio NewYork NY

Medcor is hiring in NewYork NY Read about our latest HealthWelfare job opening via the link in our bio COVID Compliance Officer RN NY amp NJ

Speaking for the Cannoli who have no voice We do not accept your labels or comparisons Be Kind Drink Espresso equality Wall Street Bull New York httpstcoNJQ57z1tlp

Have you ever had a role like Associate Partner Financial Services What did you like most about that job Banking NewYork NY

optimism by the Dapper Don berzinsky Theres just something about a draped bow tie in black and white by normallyaspirated scbocaraton happynewyear blacktie Rockafeller Center In Manhattan httpstcoYyGphuaNrM

I hope the New