In [None]:
!pip install twint

In [None]:
!pip install --user --upgrade -e git+https://github.com/twintproject/twint.git@origin/master#egg=twint

In [None]:
!pip install nest_asyncio

In [None]:
import twint
import nest_asyncio
nest_asyncio.apply()
import pandas as pd
from os import mkdir, path
import datetime
from datetime import timezone
import re

def twint_clean(word, num=False, eng=False):
    """Strip tweet noise from ``word`` and return the cleaned text.

    The steps run in an order that lets every pattern still see the
    punctuation it relies on: URLs first, then retweet prefixes, mentions,
    hashtags and HTML-entity remnants, and only then the generic
    special-character strip. Each step works on the result of the previous
    one (earlier revisions restarted from the raw input halfway through,
    which silently discarded the preceding substitutions).

    Args:
        word: Raw text (a tweet or a search keyword).
        num: When truthy, digits are removed as well.
        eng: When truthy, ASCII letters are removed as well.

    Returns:
        The cleaned text, containing only digits, ASCII letters and Hangul
        (``ㄱ-ㅎ`` and ``가-힣``), with words separated by single spaces and
        no leading/trailing whitespace.
    """
    # URLs with an explicit scheme.
    clean_data = re.sub(
        r"http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+",
        ' ',
        word,
    )
    # Scheme-less links / anything shaped like "host.tld/path".
    clean_data = re.sub(
        r"[-a-zA-Z0-9@:%._\+~#=]{1,256}\.[a-zA-Z0-9()]{2,6}\b([-a-zA-Z0-9()@:%_\+.~#?&//=]*)",
        ' ',
        clean_data,
    )
    # Retweet prefix; must run before mentions are removed or the
    # "RT @user: " pattern can no longer match.
    clean_data = re.sub(r'RT @[\w_]+: ', '', clean_data)
    # Mentions.
    clean_data = re.sub(r'@[\w_]+', '', clean_data)
    # Hashtags (ASCII tags only, as in the original pattern).
    clean_data = re.sub(r'[#]+[0-9a-zA-Z_]+', '', clean_data)
    # HTML entity remnants such as "&amp" (the trailing ";" is stripped below).
    clean_data = re.sub(r'[&]+[a-z]+', ' ', clean_data)
    # Everything that is not a digit, ASCII letter or Hangul becomes a space.
    # This also covers newlines. A space (rather than nothing) keeps words
    # separated for downstream tokenisation.
    clean_data = re.sub(r'[^0-9a-zA-Zㄱ-ㅎ가-힣]', ' ', clean_data)

    if num:
        clean_data = re.sub(r'\d+', '', clean_data)

    if eng:
        clean_data = re.sub(r'[a-zA-Z]', '', clean_data)

    # Collapse the runs of spaces left behind by the substitutions above.
    return ' '.join(clean_data.split())

def twint_setting(directory, s_keyword, since, until, output_name, limit):
    """Configure and run a single twint search, storing the result as CSV.

    Args:
        directory: Base folder of the crawl; used to build the
            ``save_endpoint`` file path.
        s_keyword: Search term passed to twint.
        since: Start date string for the search window.
        until: End date string for the search window.
        output_name: Path handed to twint as the output file.
        limit: Value assigned to twint's ``Limit`` option.

    Failures inside the search are reported on stdout and swallowed so that
    a caller looping over many dates can continue with the next one.
    """
    c = twint.Config()

    c.Limit = limit
    c.Search = s_keyword
    c.Since = since
    c.Until = until
    c.Store_csv = True
    c.Output = output_name
    c.Hide_output = True
    c.Debug = True
    # NOTE(review): twint's option for a resume/endpoint file appears to be
    # `Resume`; `Rename` may be a typo that twint silently ignores. Confirm
    # against the twint configuration docs before changing.
    c.Rename = f'{directory}/save_endpoint/save_endpoint_{since}.txt'
    c.Popular_tweets = True

    try:
        twint.run.Search(c)
    except Exception as err:
        # Deliberately best-effort. KeyboardInterrupt and SystemExit are not
        # Exception subclasses, so they still propagate. The error is printed
        # so the cause of a failed day is not lost.
        print(f"Problem with {since}: {err!r}")
              
def twint_crawl(s_keyword, since, until, limit=2):
    """Crawl tweets for ``s_keyword`` one day at a time.

    A folder named after the cleaned keyword is created (with a
    ``save_endpoint`` sub-folder), and for every day in
    ``pd.date_range(since, until)`` one search is run whose output file is
    ``<folder>/<YYYYMMDD>.csv``.

    Args:
        s_keyword: Search term; its cleaned form is the output folder name.
        since: First day of the crawl (anything ``pd.date_range`` accepts).
        until: Last day of the crawl; ``pd.date_range`` includes it.
        limit: Per-day limit forwarded to ``twint_setting``.

    Raises:
        ValueError: If cleaning the keyword leaves an empty folder name.
    """
    # Function-scope import: the file only imports mkdir/path from os.
    from os import makedirs

    directory = twint_clean(s_keyword)
    if not directory:
        raise ValueError(
            f"Keyword {s_keyword!r} yields an empty directory name after cleaning"
        )

    already_there = path.isdir(directory)
    # exist_ok=True creates both levels and, unlike two consecutive mkdir
    # calls, still creates save_endpoint when only the top folder exists.
    makedirs(path.join(directory, 'save_endpoint'), exist_ok=True)
    if already_there:
        print("Directory", directory, "already exists")
    else:
        print("Directory", directory, "created")

    for day in pd.date_range(since, until):
        # Each search covers exactly one day: [day, day + 1).
        day_start = day.strftime("%Y-%m-%d")
        day_end = (day + datetime.timedelta(days=1)).strftime("%Y-%m-%d")

        output_name = path.join(directory, day.strftime("%Y%m%d") + ".csv")

        print(f'Getting {day_start}')
        twint_setting(directory, s_keyword, day_start, day_end, output_name, limit)
        
# Search keyword ("corona"); twint_crawl also uses its cleaned form as the
# name of the output directory.
Keyword = "코로나"
# Crawl day by day over the given date range (pd.date_range includes both
# end dates), passing limit=25 to each per-day search.
twint_crawl(Keyword, '2021-04-15', '2021-04-17', limit = 25)


In [None]:
from pathlib import Path
from tqdm.auto import tqdm
import os

# Folder the crawl wrote to: named after the cleaned keyword.
DATA_DIR = Path(f"./{twint_clean(Keyword)}")
# os.listdir returns entries in arbitrary order; sorting makes the
# concatenation order (and therefore .tail()) deterministic. The crawl names
# files YYYYMMDD.csv, so lexical order is also chronological.
csv_file = sorted(pos_csv for pos_csv in os.listdir(DATA_DIR) if pos_csv.endswith('.csv'))

# Load every per-day CSV into its own DataFrame.
df_list = []
for f_name in tqdm(csv_file):
    temp_df = pd.read_csv(DATA_DIR / f_name)
    df_list.append(temp_df)

# Stack the per-day frames; sort=False keeps the column order as read.
df = pd.concat(df_list, sort=False)
# Notebook display: last rows of the columns of interest.
df[["date", "time", "username", "tweet", "retweets_count"]].tail()

In [None]:
# Show the interpreter's site-packages directories.
# NOTE(review): presumably used to locate the konlpy / mecab installation; confirm.
import site; site.getsitepackages()

In [None]:
from konlpy.tag import Mecab

# Raw string: the path contains backslashes ("\m" is an invalid escape
# sequence in a normal literal and triggers a warning on recent Pythons).
# The resulting path value is unchanged.
mecab = Mecab(dicpath=r'C:\mecab\mecab-ko-dic')

def preprecessing_mecab(readData):
    """Clean ``readData`` and return its content morphemes as a list of strings.

    The text is passed through ``twint_clean`` and tagged with
    ``mecab.pos``. A morpheme is kept only if its tag is not in one of the
    excluded categories below and its surface form is not a single character.
    """
    # Tags to discard, grouped by category.
    particle_tags = ("JKS", "JKC", "JKG", "JKO", "JKB", "JKV", "JKQ", "JX", "JC")
    symbol_tags = ("SF", "SE", "SSO", "SSC", "SC", "SY")
    ending_tags = ("EP", "EF", "EC", "ETN", "ETM")
    auxiliary_verb_tags = ("VX",)
    number_tags = ("SN",)
    dropped_tags = frozenset(
        particle_tags + symbol_tags + ending_tags + auxiliary_verb_tags + number_tags
    )

    tagged = mecab.pos(twint_clean(readData))

    # One pass: drop excluded tags and one-character morphemes, keep the text.
    return [
        surface
        for surface, tag in tagged
        if tag not in dropped_tags and len(surface) != 1
    ]


# Sample tweet-like text (Korean news headline followed by a URL) used to
# try out the preprocessing function.
SAMPLE_TEXT ="<이건 또 무슨?? [단독]질병청, 코로나19 백신 맞고 사망해도 줄 돈 없다 文정부, 사망보상금 4억3000만원 약속했지만 보상금 지급할 질병청 예산 4억5000만원뿐 https://m.news.nate.com/view/20210422n02770?sect=sisa&list=rank&cate=interest>"

# Notebook display: the list of morphemes kept for the sample.
preprecessing_mecab(SAMPLE_TEXT)