In [1]:
import re
from tqdm import tqdm

import pandas as pd

import jieba.analyse as analyse

from nltk import pos_tag
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

# Notebook-wide setup: show every DataFrame column when a frame is displayed,
# and register tqdm's `progress_apply` on pandas objects (blank bar label).
pd.options.display.max_columns = None
tqdm.pandas(desc='')

### 读取文件，提取月份和日期

In [2]:
# Load the raw tweet dump. The explicit "\n" line terminator is kept from the
# original cell (presumably so stray "\r" inside tweet text is not treated as
# a row break -- TODO confirm against the raw file).
df = pd.read_csv('./data/Covid_19_Twitter/Twitter.csv', lineterminator="\n")

# `created_at` is split on the first space; the leading token is the date
# (the displayed output shows values such as month '2020-05', day '21').
df['month'] = df['created_at'].progress_apply(lambda x: x.split(' ')[0][:7])
df['day'] = df['created_at'].progress_apply(lambda x: x.split(' ')[0].split('-')[-1])

# Rename and select without `inplace`, then take an explicit copy: later cells
# add columns to `df`, and assigning into a column-subset of another frame is
# what triggers pandas' SettingWithCopyWarning.
df = df.rename(columns={'text': 'content'})[['month', 'day', 'content']].copy()

# Checkpoint the trimmed frame; later cells overwrite this same file.
df.to_csv('./data/Twitter_src.csv', index=False)
df.head()

100%|██████████████████████████████| 791771/791771 [00:00<00:00, 1781421.78it/s]
100%|██████████████████████████████| 791771/791771 [00:00<00:00, 1643148.64it/s]


Unnamed: 0,month,day,content
0,2020-05,21,Here is some uplifting news and positive figur...
1,2020-05,28,This needs to be watched! Republicans covered ...
2,2020-05,26,@piersmorgan People are not interested in this...
3,2020-05,27,Omg. He’s such an embarrassment to your countr...
4,2020-05,29,"With your help, local artists, musicians, and ..."


### 筛选英文（仅保留英文字母并转为小写）

In [3]:
def format_str(content):
    """Reduce a tweet to lowercase ASCII words separated by single spaces.

    Every character outside ``a-z`` / ``A-Z`` (digits, punctuation, emoji,
    non-Latin text, newlines) is treated as a separator.

    Args:
        content: Raw tweet text. Non-string values (for example the float
            NaN pandas uses for a missing cell) are treated as empty text.

    Returns:
        str: The lowercase words joined by single spaces, with no leading or
        trailing whitespace; ``''`` when no letters remain.
    """
    if not isinstance(content, str):
        return ''

    letters_only = re.sub(r'[^a-zA-Z]', ' ', content)
    # split() with no argument collapses runs of whitespace and drops the
    # leading/trailing blanks; split(' ') followed by ' '.join() would hand
    # back the string unchanged.
    return ' '.join(letters_only.split()).lower()

# Normalise every tweet with format_str (progress bar via tqdm), then
# checkpoint the frame to disk and preview it.
df['content_format'] = df['content'].progress_apply(format_str)
df.to_csv('./data/Twitter_src.csv', index=False)
df.head()

100%|███████████████████████████████| 791771/791771 [00:04<00:00, 185045.76it/s]


Unnamed: 0,month,day,content,content_format
0,2020-05,21,Here is some uplifting news and positive figur...,here is some uplifting news and positive figur...
1,2020-05,28,This needs to be watched! Republicans covered ...,this needs to be watched republicans covered ...
2,2020-05,26,@piersmorgan People are not interested in this...,piersmorgan people are not interested in this...
3,2020-05,27,Omg. He’s such an embarrassment to your countr...,omg he s such an embarrassment to your countr...
4,2020-05,29,"With your help, local artists, musicians, and ...",with your help local artists musicians and ...


### 筛选单词，筛选名词，并还原词形

In [4]:
def sent2word(content):
    """Extract the lemmatised nouns of a sentence that WordNet recognises.

    The text is tokenised and POS-tagged with NLTK. Tokens tagged as nouns
    (``NN``, ``NNS``, ``NNP``, ``NNPS``) are lemmatised, and a lemma is kept
    only if it is longer than three characters and has at least one WordNet
    synset.

    Args:
        content: Text to process. Non-string values (for example NaN from a
            missing cell) are treated as empty text.

    Returns:
        str: The kept lemmas in their original order, joined by single
        spaces with no leading space; ``''`` when nothing is kept.
    """
    if not isinstance(content, str):
        return ''

    noun_tags = ('NN', 'NNS', 'NNP', 'NNPS')
    lemmatizer = WordNetLemmatizer()

    kept = []
    for token, tag in pos_tag(word_tokenize(content)):
        if tag not in noun_tags:
            continue
        # lemmatize() is called without a POS argument, as in the original;
        # NLTK documents the default as noun, which matches the tag filter.
        lemma = lemmatizer.lemmatize(token)
        # Cheap length test first so the WordNet lookup only runs when needed.
        if len(lemma) > 3 and wordnet.synsets(lemma):
            kept.append(lemma)

    # Joining a list avoids the stray leading space that `content += ' ' + word`
    # put in front of every result.
    return ' '.join(kept)

# Run noun extraction over the normalised text (the slow step of this
# notebook -- about eight minutes in the recorded run), then checkpoint.
df['content_cut'] = df['content_format'].progress_apply(sent2word)
df.to_csv('./data/Twitter_src.csv', index=False)
df.head()

100%|█████████████████████████████████| 791771/791771 [08:13<00:00, 1604.82it/s]


Unnamed: 0,month,day,content,content_format,content_cut
0,2020-05,21,Here is some uplifting news and positive figur...,here is some uplifting news and positive figur...,news figure afternoon http
1,2020-05,28,This needs to be watched! Republicans covered ...,this needs to be watched republicans covered ...,republican status member http
2,2020-05,26,@piersmorgan People are not interested in this...,piersmorgan people are not interested in this...,people shit thing pandemic
3,2020-05,27,Omg. He’s such an embarrassment to your countr...,omg he s such an embarrassment to your countr...,embarrassment country
4,2020-05,29,"With your help, local artists, musicians, and ...",with your help local artists musicians and ...,help artist musician


### 写入停用词

In [5]:
# Corpus-specific stop words, written to disk here and reloaded by the next
# cell. The literal is listed without repeats; the set of words is unchanged.
stop_words = [
    # non-month terms
    'http', 'news', 'case', 'people', 'para', 'year', 'time', 'today',
    'week', 'death', 'test', 'virus', 'report', 'number', 'month',
    'corona', 'record', 'thank', 'please', 'part', 'thanks', 'hour', 'article',
    'season', 'medium', 'update', 'nothing',
    # month names
    'january', 'february', 'march', 'april', 'may', 'june',
    'july', 'august', 'september', 'october', 'november', 'december',
]

# sorted(set(...)) still guards against accidental duplicates, but unlike
# list(set(...)) it gives the same order (and the same file) on every run.
stop_words = sorted(set(stop_words))

# Written as UTF-8 to match the encoding the reader cell opens the file with.
with open('./data/StopWords_EN.txt', 'w', encoding='utf-8') as file:
    # One word per line, each terminated by a newline (same layout as before).
    file.writelines(stop_word + '\n' for stop_word in stop_words)

### 读取停用词表

In [6]:
# Reload the stop-word list from disk, de-duplicated.
# NOTE: split('\n') leaves an empty-string entry when the file ends with a
# newline (which the writer cell above produces); that '' entry is kept, so
# the empty string counts as a stop word.
with open('./data/StopWords_EN.txt', encoding='utf-8') as stop_file:
    stop_lines = stop_file.read().split('\n')
stop_words = list(set(stop_lines))

### 去除停用词

In [7]:
def delStopWord(content, stop_words):
    """Remove stop words from a whitespace-separated string of tokens.

    Args:
        content: Tokens separated by whitespace. Non-string values (for
            example NaN from a missing cell) are treated as empty text.
        stop_words: Container of words to drop; membership is tested with
            ``in``, so a list or a set both work.

    Returns:
        str: The remaining tokens in their original order, joined by single
        spaces with no leading space; ``''`` when nothing remains.
    """
    if not isinstance(content, str):
        return ''

    # split() with no argument never yields empty tokens, so leading or
    # repeated spaces in `content` cannot leak into the result. Splitting on
    # ' ' only dropped those empty tokens when '' happened to be a stop word.
    kept = [seg for seg in content.split() if seg not in stop_words]
    return ' '.join(kept)

# Strip the stop words from every noun string, then checkpoint the frame to
# disk and preview it.
df['content_cut_stop'] = df['content_cut'].progress_apply(
    lambda nouns: delStopWord(nouns, stop_words)
)
df.to_csv('./data/Twitter_src.csv', index=False)
df.head()

100%|███████████████████████████████| 791771/791771 [00:01<00:00, 605395.13it/s]


Unnamed: 0,month,day,content,content_format,content_cut,content_cut_stop
0,2020-05,21,Here is some uplifting news and positive figur...,here is some uplifting news and positive figur...,news figure afternoon http,figure afternoon
1,2020-05,28,This needs to be watched! Republicans covered ...,this needs to be watched republicans covered ...,republican status member http,republican status member
2,2020-05,26,@piersmorgan People are not interested in this...,piersmorgan people are not interested in this...,people shit thing pandemic,shit thing pandemic
3,2020-05,27,Omg. He’s such an embarrassment to your countr...,omg he s such an embarrassment to your countr...,embarrassment country,embarrassment country
4,2020-05,29,"With your help, local artists, musicians, and ...",with your help local artists musicians and ...,help artist musician,help artist musician


### 保存文件

In [8]:
# Build the final dataset from the checkpoint file rather than the in-memory
# frame: read_csv turns empty fields into NaN by default, so tweets whose
# cleaned text is empty can be removed with dropna below.
#
# The steps are chained instead of using inplace=True on a column subset,
# which avoids SettingWithCopyWarning and makes the cell safe to re-run.
df = (
    pd.read_csv('./data/Twitter_src.csv', lineterminator="\n")
    .loc[:, ['month', 'day', 'content_cut_stop']]
    .rename(columns={'content_cut_stop': 'content'})
    # mergesort is stable: rows keep their file order within each month, so
    # the output file is identical from run to run.
    .sort_values('month', kind='mergesort')
    .dropna(subset=['content'])
)
df.to_csv('./data/Twitter.csv', index=False)