# Wikipedia Dataset

In [None]:
from mediawiki_dump.dumps import WikipediaDump
from mediawiki_dump.reader import DumpReaderArticles
from mediawiki_dump.tokenizer import clean, tokenize

from datasets import Dataset

import csv

from tqdm import tqdm
import logging

logging.basicConfig(level=logging.INFO)

def wiki2csv(lang):
    dump = WikipediaDump(lang)
    pages = DumpReaderArticles().read(dump)
    
    with open(f"{lang}_wiki.csv", "w") as f:
        writer = csv.writer(f)
        writer.writerow(['id', 'title', 'content'])
        
        for page in tqdm(pages):
            writer.writerow([page.page_id, page.title, clean(page.content)])

## Turkish Wiki

In [None]:
wiki2csv("tr")

### PoS-Tagged Turkish Wiki

Using Wikipedia articles in csv file, a new pos-tagged wiki dataset is generated.

PoS Taggers:
1. 

# Forum Donanim Haber Dataset

521 url den 158 row çıktı.
536 url den 178 row çıktı.
1000 url den 232 row çıktı.

In [None]:
from bs4 import BeautifulSoup
import requests

import re
import csv
import random

from tqdm import tqdm
from IPython.display import clear_output

class DHCrawler:
    def __init__(self, base_site):
        self.data_dict = {}
        
        self.base_site = base_site
        self.n_row = 0
        
        self.visited_crawl = []
        self.unvisited_crawl = [base_site]
        
        self.link_list = []
                
    def __call__(self, filename):
        with open(f"{filename}.csv", "a") as f:
            writer = csv.writer(f)
            writer.writerow(['id', 'url', 'post', 'response'])
            
        while self.n_row < 10000:
            self._crawl_pages()
            self.crawl(filename=filename)
    
    def _crawl_pages(self):
        while len(self.link_list) < 1000:
            pop_index = random.randint(0, len(self.unvisited_crawl)-1)
            site = self.unvisited_crawl.pop(pop_index)

            self._get_pages(site)
            self.visited_crawl.append(site)
            
            clear_output(wait=True)
            print(f"Unvisited: {len(self.unvisited_crawl)} Visited: {len(self.visited_crawl)} Forum : {len(self.link_list)}")
            
    def _get_pages(self, site):
        req = requests.get(site)
        if req.status_code == 200:
            content = req.content
            soup = BeautifulSoup(content, 'html.parser')
            
            queue = soup.find_all(href=True)
            
            for el in queue:
                link = el['href']
                
                try:
                    if link[0] == '/':
                        link = self.base_site + link
                except:
                    pass
                    
                if self.base_site in link and link not in self.unvisited_crawl and link not in self.visited_crawl:
                    search_obj = re.search(r"https://forum.donanimhaber.com/\S*--f", link)
                
                    if search_obj:
                        self.unvisited_crawl.insert(0, link)
                    else:
                        self.unvisited_crawl.append(link)
                        
                    search_obj = re.search(r"https://forum.donanimhaber.com/\S*--\d{9}", link)
                    if search_obj and search_obj.span()[1] == len(link):
                        self.link_list.append(link)

    def crawl(self, filename):
        def crawl_single_page(url):
            req = requests.get(url)
            if req.status_code == 200:
                content = req.content
                soup = BeautifulSoup(content, 'html.parser')

                post = soup.find(class_='msg').get_text()
                resp = soup.find(id=re.compile('bestComment_\d+')).get_text()

                return {'post': post.strip(), 'response': resp.strip()}
        
        with open(f"{filename}.csv", "a") as f:
            writer = csv.writer(f)
        
            while len(self.link_list) > 0:
                curr_page = self.link_list.pop()

                try:
                    entry = crawl_single_page(curr_page)
                    post, response = entry['post'], entry['response']
                    
                    if post and len(post) >= 15 and response and len(response) >= 15:
                        i = int(curr_page[-9:])
                        writer.writerow([i, curr_page, post, response])
                        self.n_row += 1
                except:
                    pass
                
                clear_output(wait=True)
                print(f"Row: {self.n_row}  Remaining: {len(self.link_list)}")

dh = DHCrawler(base_site='https://forum.donanimhaber.com')
dh(filename='forum_dh2')

In [None]:
import sys
import csv

csv.field_size_limit(sys.maxsize)

def merge_files(*args):
    ids = []
    
    with open(args[-1], "w") as out_f:
        writer = csv.writer(out_f)
        writer.writerow(['id', 'url', 'post', 'response'])
        
        for arg in args[:-1]:
            with open(arg) as in_f:
                reader = csv.reader(in_f)
                
                for row in reader:
                    try:
                        int(row[0])
                        if row[0] not in ids:
                            ids.append(row[0])
                            writer.writerow(row)
                    except:
                        pass

#merge_files('forum_dh.csv', 'forum_dh1.csv', 'forum_dh2.csv', 'out.csv')


In [12]:
import pandas as pd
from random import randint

def random_except(a, b, e):
    while True:
        r = randint(a, b)
        if r not in e:
            return r

def generate_dataset(data_path, frac=0.75, repeat=None):
    df = pd.read_csv(data_path)

    true_pairs = df.sample(frac=frac)[['post', 'response']]
    true_pairs = true_pairs.assign(label=[1]*len(true_pairs))

    false_responses = df[~df.index.isin(true_pairs.index)]['response']

    if repeat is None:
        repeat = int(frac / (1-frac))

    false_pairs = pd.DataFrame(columns=['post', 'response', 'label'])

    for _ in range(repeat):
        for i, resp in false_responses.iteritems():
            rand_ind = random_except(0, len(df)-1, [i])
            post = df.post[rand_ind]
            generated_false = pd.DataFrame(data=[{'post': post, 'response': resp, 'label': 0}])

            false_pairs = pd.concat([false_pairs, generated_false], ignore_index=True)

    all_pairs = pd.concat([true_pairs, false_pairs], ignore_index=True)
    all_pairs = all_pairs.drop_duplicates()
    
    return all_pairs.sample(frac=1)

#true_pairs = generate_dataset('forum_dh.csv', frac=1, repeat=0)
#false_pairs = generate_dataset('forum_dh.csv', frac=0, repeat=1)

#balanced_dataset = pd.concat([true_pairs, false_pairs], ignore_index=True)
balanced_dataset = generate_dataset('forum_dh.csv', frac=.9)
balanced_dataset.to_parquet('post_resp_dataset_90.parquet')
balanced_dataset.describe()

Unnamed: 0,post,response,label
count,3188,3188,3188
unique,1699,1774,2
top,"Yayıncılığını Electronic Arts'ın yaptığı, oyun...",68F4EBC952C431952F5E4FCDEC52DB7B0CA550AD2D21E5...,1
freq,6,9,1598


In [13]:
import pandas as pd
df = pd.read_csv('post_resp_dataset_90.csv')
print(len(df))
df = df.drop_duplicates()
print(len(df))

3188
3188


# Dataset Analysis

In [6]:
import pandas as pd

csv_file = 'post_response_75_512.csv'
df = pd.read_csv(csv_file)
df.head()

Unnamed: 0,post,response,label
0,Kia önümüzdeki hafta Detroit Otomobil Fuarı'nd...,iyi görünüyor tabii boşline versiyonunu görmek...,1
1,"Selam dostlar, ""Ghostrunner"" yeni sürüm olduğu...",https://gameolog.net/gundem/ghostrunner-turkce...,1
2,"Pekin geçtiğimiz günlerde, Çin İnternet toplul...",Sabit bir şekilde 20-30 hız verseler her kulla...,1
3,Barış Murat YağcıAtakan ArslanAdem KılıçcıBatu...,@Alinda98 @Jankat @mr.yasoo @franz392,1
4,"Hyundai ve Kia, elektrikli otomobil sürücüleri...",Pil teknolojilerinde radikal bir gelişim olmad...,1


In [7]:
df.columns

Index(['post', 'response', 'label'], dtype='object')

In [8]:
df.shape

(1996, 3)

In [9]:
df.isna().sum()

post        0
response    0
label       0
dtype: int64

In [12]:
func = lambda x: len(x.split())  # or len

df['post_length'] = df['post'].apply(func)
df['response_length'] = df['response'].apply(func)

df['content'] = df['post'] + ' ' + df['response']
df['content_length'] = df['content'].apply(func)

df.head()

Unnamed: 0,post,response,label,post_length,response_length,content,content_length
0,Kia önümüzdeki hafta Detroit Otomobil Fuarı'nd...,iyi görünüyor tabii boşline versiyonunu görmek...,1,239,13,Kia önümüzdeki hafta Detroit Otomobil Fuarı'nd...,252
1,"Selam dostlar, ""Ghostrunner"" yeni sürüm olduğu...",https://gameolog.net/gundem/ghostrunner-turkce...,1,18,5,"Selam dostlar, ""Ghostrunner"" yeni sürüm olduğu...",23
2,"Pekin geçtiğimiz günlerde, Çin İnternet toplul...",Sabit bir şekilde 20-30 hız verseler her kulla...,1,171,26,"Pekin geçtiğimiz günlerde, Çin İnternet toplul...",197
3,Barış Murat YağcıAtakan ArslanAdem KılıçcıBatu...,@Alinda98 @Jankat @mr.yasoo @franz392,1,41,4,Barış Murat YağcıAtakan ArslanAdem KılıçcıBatu...,45
4,"Hyundai ve Kia, elektrikli otomobil sürücüleri...",Pil teknolojilerinde radikal bir gelişim olmad...,1,204,46,"Hyundai ve Kia, elektrikli otomobil sürücüleri...",250


In [13]:
df.describe()

Unnamed: 0,label,post_length,response_length,content_length
count,1996.0,1996.0,1996.0,1996.0
mean,0.512525,101.842685,29.662325,131.50501
std,0.499968,77.113992,36.489326,80.695644
min,0.0,1.0,2.0,4.0
25%,0.0,32.0,9.0,59.0
50%,1.0,88.0,18.0,126.0
75%,1.0,162.0,33.0,190.0
max,1.0,353.0,326.0,356.0


In [16]:
from sklearn.feature_extraction.text import CountVectorizer
from tqdm import tqdm

def find_most_frequents(col):
    vectorizer = CountVectorizer(lowercase=False)
    X = vectorizer.fit_transform(df[col])

    ls = []
    for k, v in tqdm(vectorizer.vocabulary_.items()):
        if len(ls) < 10:
            ls.append((k, v))
        else:
            idx = min([i for i, el in enumerate(ls)])
            if v > ls[idx][1]:
                ls[idx] = (k, v)

    ls.sort(key=lambda x: x[1], reverse=True)
    
    return ls

find_most_frequents(col='content')

100%|████████████████████████████████████████████████████████████| 44564/44564 [00:00<00:00, 437826.09it/s]


[('ᴴᴰ', 44563),
 ('örtüsünü', 43297),
 ('önümüzdeki', 43265),
 ('nda', 30532),
 ('kaldıracağı', 25835),
 ('hafta', 23223),
 ('Otomobil', 7950),
 ('Fuarı', 4917),
 ('Detroit', 3954),
 ('2018', 609)]