# Welcome 

## Import and Initialize 

In [None]:
import pandas as pd 
import json
import re
from nltk.corpus import wordnet as wn
from nltk.corpus import words as w
from nltk.corpus import stopwords 
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer 


import matplotlib.pyplot as plt
import numpy as np
from matplotlib import colors
from matplotlib.ticker import PercentFormatter
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style('whitegrid')
%matplotlib inline

In [None]:
# English stopword list; consumed by remove_stopwords_topic.
stops = stopwords.words("english")
# Word lists from the NLTK `words` corpus and from WordNet.
words = w.words()
wordswn = wn.words()
# Shared stemmer / lemmatizer instances used by the rule-generation cells.
ps = PorterStemmer()
lemmatizer = WordNetLemmatizer() 

In [None]:
# Input CSV and the directory that holds the rule files read/written below.
# NOTE(review): neither path starts with "/", so both resolve relative to the
# current working directory -- confirm that is intended.
input_file = "home/siyanew/ase/github_topic/data/final_readme_more_10_star.csv"
rules_directory = "home/siyanew/ase/github_topic/data/rules"

# Apply Function 😎️

In [None]:
def apply_csv(file_name, df):
    """Rewrite ``df.topics`` according to the replacement rules in a CSV file.

    Each rule line has the form
    ``<ignored>,<topic>,<replacement>[,<replacement>...]``.  Rule fields are
    lower-cased and stripped; topics are stripped before being looked up.
    The first replacement may be a sentinel:

    * ``-1`` -- drop the topic;
    * ``-2`` -- keep the topic unchanged.

    Otherwise the topic is replaced by all listed replacements.  Blank lines
    and lines that lack a topic or a replacement are ignored instead of
    raising ``IndexError`` (an empty rules file is therefore a no-op).

    Args:
        file_name: Path of the rule file.
        df: DataFrame with a ``topics`` column of comma-separated topics.

    Returns:
        A new DataFrame with the rewritten ``topics`` column, without the rows
        whose topic list became empty.  ``df`` itself is not modified.
    """
    edit_list = {}
    with open(file_name) as file:
        for line in file:
            # rstrip (not strip) so a leading empty field cannot shift columns.
            items = [item.strip().lower()
                     for item in line.strip().rstrip(',').split(',')]
            if len(items) < 3 or not items[1]:
                continue
            # Empty fields (e.g. from "a--b" split on "-") are not topics.
            replacements = [item for item in items[2:] if item]
            if replacements:
                edit_list[items[1]] = replacements

    def make_topics(topics):
        result = []
        for t in topics.split(','):
            t = t.strip()
            if not t:
                continue
            replacements = edit_list.get(t)
            if replacements is None or replacements[0] == '-2':
                result.append(t)
            elif replacements[0] != '-1':
                result.extend(replacements)
        # dict.fromkeys removes duplicates while keeping a stable order.
        return ",".join(dict.fromkeys(result))

    # assign() returns a new frame, so a filtered slice passed in by the
    # caller is never written to in place.
    df = df.assign(topics=df.topics.apply(make_topics))
    return df[df.topics != '']

In [None]:
def aggregate_topics(file_name, df):
    """Collapse groups of topics into a single topic.

    Each rule line has the form ``<ignored>,<target>,<part>[,<part>...]``
    (fields are lower-cased and stripped).  When a row contains *all* parts
    of a rule, those parts are removed and ``target`` is added.  Blank lines
    and rules without a target or without parts are ignored; a rule with no
    parts would otherwise match every row.

    Args:
        file_name: Path of the rule file.
        df: DataFrame with a ``topics`` column of comma-separated topics.

    Returns:
        A new DataFrame with the rewritten ``topics`` column (targets first,
        then the remaining topics in their original order, de-duplicated).
        ``df`` itself is not modified.
    """
    rules = []
    with open(file_name) as file:
        for line in file:
            items = [item.lower().strip()
                     for item in line.strip().rstrip(',').split(',')]
            parts = frozenset(item for item in items[2:] if item)
            if len(items) < 3 or not items[1] or not parts:
                continue
            rules.append((parts, items[1]))

    def make_topics(topics):
        topics = topics.split(',')
        present = set(topics)
        merged = []
        consumed = set()
        for parts, target in rules:
            if parts <= present:
                merged.append(target)
                consumed |= parts
        # Filtering (rather than list.remove) also drops repeated parts.
        remaining = [t for t in topics if t not in consumed]
        return ",".join(dict.fromkeys(merged + remaining))

    return df.assign(topics=df.topics.apply(make_topics))

In [None]:
def aggregate_two_topics(file_name, df):
    """Replace topics one-for-one using a rule file.

    Each rule line has the form ``<ignored>,<new topic>,<old topic>`` (fields
    are lower-cased and stripped); every occurrence of ``old topic`` becomes
    ``new topic``.  Blank lines and lines with fewer than three non-empty
    fields are ignored instead of raising ``IndexError``.

    Args:
        file_name: Path of the rule file.
        df: DataFrame with a ``topics`` column of comma-separated topics.

    Returns:
        A new DataFrame with the rewritten ``topics`` column (original order,
        de-duplicated).  ``df`` itself is not modified.
    """
    mapping = {}
    with open(file_name) as file:
        for line in file:
            items = [item.lower().strip()
                     for item in line.strip().rstrip(',').split(',')]
            if len(items) < 3 or not items[1] or not items[2]:
                continue
            mapping[items[2]] = items[1]

    def make_topics(topics):
        renamed = (mapping.get(topic, topic) for topic in topics.split(','))
        return ",".join(dict.fromkeys(renamed))

    return df.assign(topics=df.topics.apply(make_topics))

In [None]:
def make_col(file_name, df, col, edit_list=None):
    """Add column ``col`` holding the topics of each row that are allowed.

    The allowed topics come from ``edit_list`` when it is given and
    non-empty; otherwise they are read from the first comma-separated field
    of each line in ``file_name`` (lower-cased and stripped, blank lines
    skipped).

    Args:
        file_name: Path of the file listing allowed topics; only read when
            ``edit_list`` is empty or ``None``.
        df: DataFrame with a ``topics`` column of comma-separated topics.
        col: Name of the column to create or overwrite.
        edit_list: Optional iterable of allowed topics.

    Returns:
        ``df`` itself, with ``col`` set in place.  Each value keeps the
        allowed topics in their original order, de-duplicated.
    """
    if not edit_list:
        edit_list = []
        with open(file_name) as file:
            for line in file:
                name = line.strip().strip(',').split(',')[0].lower().strip()
                if name:
                    edit_list.append(name)

    # A set makes each membership test O(1) instead of scanning the list.
    allowed = set(edit_list)

    def make_topics(topics):
        kept = (topic for topic in topics.split(',') if topic in allowed)
        return ",".join(dict.fromkeys(kept))

    df[col] = df.topics.apply(make_topics)
    return df

## Start Cleaning

In [None]:
# Preserve the raw topics column before the cleaning rules rewrite df.topics.
# NOTE(review): no cell visible here loads `df` (e.g. from `input_file`) --
# confirm it is defined before this cell runs.
df['original_topics'] = df.topics

In [None]:
# Keep only rows with at least 10 stars.
df = df[df.stars>=10]

In [None]:
# Apply the rules stored in topics_contains_version.csv.
df = apply_csv(f'{rules_directory}/topics_contains_version.csv',df)

In [None]:
# Apply the rules stored in topics_contains_number.csv.
df = apply_csv(f'{rules_directory}/topics_contains_number.csv', df)

In [None]:
# Apply the rules stored in split_dash_topics.csv.
df = apply_csv(f'{rules_directory}/split_dash_topics.csv',df)

In [None]:
# Apply the rules stored in contains_top_topics.csv.
df = apply_csv(f'{rules_directory}/contains_top_topics.csv',df)

In [None]:
# Apply the rules stored in remove_plural_topics.csv.
df = apply_csv(f'{rules_directory}/remove_plural_topics.csv', df)

In [None]:
# Apply the rules stored in contains_selected_topics.csv.
df = apply_csv(f'{rules_directory}/contains_selected_topics.csv',df)

In [None]:
# Apply the rules stored in contractions.csv.
df = apply_csv(f'{rules_directory}/contractions.csv',df)

In [None]:
# Apply the rules stored in remove_stopwords_topic.csv.
df = apply_csv(f'{rules_directory}/remove_stopwords_topic.csv',df)

In [None]:
# Apply the rules stored in remove_lemmatize_topic.csv.
df = apply_csv(f'{rules_directory}/remove_lemmatize_topic.csv',df)

In [None]:
# Replace topics one-for-one using the rules stored in replace.csv.
df = aggregate_two_topics(f'{rules_directory}/replace.csv',df)

In [None]:
# Collapse topic groups using the rules stored in abbr.csv.
df = aggregate_topics(f'{rules_directory}/abbr.csv',df)

In [None]:
# Apply the rules stored in delete.csv.
df = apply_csv(f'{rules_directory}/delete.csv',df)

## Remove low frequency topics 
Remove topics whose frequency is below the threshold.

In [None]:
def remove_low_freq_topics(file_name, threshold=20):
    """Write a delete (``-1``) rule for every topic rarer than ``threshold``.

    Topic occurrences are counted over the comma-separated ``topics`` column
    of the global ``df``.  One line ``<running index>,<topic>,-1,`` is written
    per topic whose count is below ``threshold``, in first-seen order.

    Args:
        file_name: Path of the rule file to (over)write.
        threshold: Minimum number of occurrences a topic needs to be kept.

    Returns:
        ``(file_name, number of rules written)``.
    """
    occurrences = {}
    for row in df.topics:
        for name in row.split(','):
            occurrences[name] = occurrences.get(name, 0) + 1

    rare = [name for name, count in occurrences.items() if count < threshold]

    with open(file_name, "w") as out:
        for index, name in enumerate(rare):
            out.write(f"{index},{name},-1,\n")
    return file_name, len(rare)

In [None]:
# Generate delete rules for topics seen fewer than 50 times, then apply them.
remove_low_freq_topics(f'{rules_directory}/low_freq_topics.csv',50)
df = apply_csv(f'{rules_directory}/low_freq_topics.csv',df)

## Remove topics that contain version numbering in their name

Remove version markers (a `v` followed by digits) from topic names, for example: ipv6, htmlv5

In [None]:
def remove_topics_contains_version(file_name):
    """Write rules that strip version markers (``v`` + digits/dots) from topics.

    Topics are taken from the comma-separated ``topics`` column of the global
    ``df``.  For every topic containing a version marker one line
    ``<frequency>,<topic>,<replacement>,`` is written, in first-seen order.
    ``replacement`` is the topic without its version markers, or ``-1``
    (delete) when nothing is left.

    The leading frequency column matches the layout of the other rule files
    and of ``apply_csv``, which reads the topic from the second field and the
    replacements from the third field onwards.

    Args:
        file_name: Path of the rule file to (over)write.

    Returns:
        ``(file_name, number of rules written)``.
    """
    topics_freq = {}
    for row in df.topics:
        for topic in row.split(','):
            topics_freq[topic] = topics_freq.get(topic, 0) + 1

    # Raw string: "\d" and "\." are invalid escapes in a normal literal.
    version = re.compile(r'v[\d\.]+')

    counter = 0
    with open(file_name, "w") as file:
        for topic, freq in topics_freq.items():
            if not version.search(topic):
                continue
            cleaned = version.sub('', topic).strip('-').strip()
            replacement = cleaned.replace('--', '-') if cleaned else '-1'
            file.write(f"{freq},{topic},{replacement},\n")
            counter += 1
    return file_name, counter


In [None]:
# Generate the version-stripping rule file.
remove_topics_contains_version(f'{rules_directory}/topics_contains_version.csv')

## Remove numbers from topics


Remove digits from topics that contain them, for example: css3 becomes css

In [None]:
def remove_topics_contains_digit(file_name):
    """Write rules that strip digits from every topic containing a digit.

    Topics are taken from the comma-separated ``topics`` column of the global
    ``df``.  Each rule line is ``<frequency>,<topic>,<replacement>,`` where
    ``replacement`` is the topic without digits (dashes tidied up), or ``-1``
    when nothing is left.  Lines are written in set-iteration order.

    Args:
        file_name: Path of the rule file to (over)write.

    Returns:
        ``(file_name, number of rules written)``.
    """
    flat = [name for row in df.topics.map(lambda x: x.split(',')) for name in row]
    freq = {}
    for name in flat:
        freq[name] = freq.get(name, 0) + 1

    written = 0
    with open(file_name, "w") as out:
        for topic in set(flat):
            if not any(ch.isdigit() for ch in topic):
                continue
            stripped = re.sub(r'\d+', '', topic).strip('-').strip().replace('--','-')
            if stripped == '':
                out.write(f'{freq[topic]},{topic},-1,\n')
            else:
                out.write(f"{freq[topic]},{topic},{stripped},\n")
            written += 1
    return file_name, written

In [None]:
# Generate the digit-stripping rule file.
remove_topics_contains_digit(f'{rules_directory}/topics_contains_number.csv')

## Remove plural topics


Remove the trailing s from topic names when the singular form is also a topic; for example, modules becomes module

In [None]:
def remove_plural_topics(file_name):
    """Write rules mapping ``<topic>s`` to ``<topic>`` when both are topics.

    Topics are taken from the comma-separated ``topics`` column of the global
    ``df``.  Each rule line is
    ``<plural frequency>,<plural>,<singular>,<singular frequency>,`` and lines
    are written in set-iteration order.

    NOTE(review): ``apply_csv`` treats every field after the topic as a
    replacement, so the trailing frequency column would be applied as a topic
    unless it is removed from the file first -- confirm.

    Args:
        file_name: Path of the rule file to (over)write.

    Returns:
        ``(file_name, number of rules written)``.
    """
    flat = [name for row in df.topics.map(lambda x: x.split(',')) for name in row]
    known = set(flat)
    freq = {}
    for name in flat:
        freq[name] = freq.get(name, 0) + 1

    written = 0
    with open(file_name, "w") as out:
        for plural in known:
            singular = plural[:-1]
            if not plural.endswith('s') or singular not in known:
                continue
            out.write(f"{freq[plural]},{plural},{singular},{freq[singular]},\n")
            written += 1
    return file_name, written

In [None]:
# Generate the plural-to-singular rule file.
remove_plural_topics(f'{rules_directory}/remove_plural_topics.csv')

## Remove - and split topics

In [None]:
def split_dash_topics(file_name):
    """Write rules that split every dash-containing topic at its dashes.

    Topics are taken from the comma-separated ``topics`` column of the global
    ``df``.  Each rule line is ``<frequency>,<topic>,<part>,<part>...,`` and
    lines are written in set-iteration order.

    Args:
        file_name: Path of the rule file to (over)write.

    Returns:
        ``(file_name, number of rules written)``.
    """
    flat = [name for row in df.topics.map(lambda x: x.split(',')) for name in row]
    freq = {}
    for name in flat:
        freq[name] = freq.get(name, 0) + 1

    written = 0
    with open(file_name, "w") as out:
        for topic in set(flat):
            if '-' not in topic:
                continue
            parts = topic.replace('-', ',')
            out.write(f"{freq[topic]},{topic},{parts},\n")
            written += 1
    return file_name, written

In [None]:
# Generate the dash-splitting rule file.
split_dash_topics(f'{rules_directory}/split_dash_topics.csv')

## Remove topics that include top topics 

In [None]:
def remove_topics_contains_top_topics(file_name, threshold=200):
    """Write rules that split a top topic out of the topics containing it.

    Topics from the global ``df`` are ranked by frequency (descending, ties
    in first-seen order).  For each of the ``threshold`` most frequent topics
    and each lower-ranked topic that contains it as a substring, one line
    ``<frequency>,<topic>,<top topic>,<remainder>,`` is written, where
    ``remainder`` is the topic with the top topic removed and dashes tidied.

    Args:
        file_name: Path of the rule file to (over)write.
        threshold: Number of most frequent topics treated as top topics.

    Returns:
        ``(file_name, number of rules written)``.
    """
    freq = {}
    for row in df.topics.map(lambda x: x.split(',')):
        for name in row:
            freq[name] = freq.get(name, 0) + 1

    ranked = sorted(freq.items(), key=lambda pair: pair[1], reverse=True)
    leaders = [name for name, _count in ranked[:threshold]]
    others = [name for name, _count in ranked[threshold:]]

    written = 0
    with open(file_name, "w") as out:
        for top_topic in leaders:
            for topic in others:
                if top_topic not in topic:
                    continue
                remainder = topic.replace(top_topic,'').replace('--','-').strip('-')
                out.write(f"{freq[topic]},{topic},{top_topic},{remainder},\n")
                written += 1
    return file_name, written

In [None]:
# Generate the top-topic rule file with the default threshold.
remove_topics_contains_top_topics(f'{rules_directory}/contains_top_topics.csv')

## Remove topics that include selected topics 

Remove selected names from topics, for example: remove module from magento-module

In [None]:
# Whitespace-separated generic words; remove_topics_contains_selected_topics
# splits this string and looks for each word inside the other topics.
selected_topics = """
extension module  example  package provider plugin
manager development demo assistant config 
"""
# app cli

In [None]:
def remove_topics_contains_selected_topics(file_name, threshold=200):
    """Write rules that split a selected word out of topics containing it.

    Topics from the global ``df`` are ranked by frequency (descending, ties
    in first-seen order) and the ``threshold`` most frequent ones are left
    alone.  For each word in the global ``selected_topics`` and each remaining
    topic that contains the word but is not equal to it, one line
    ``<frequency>,<topic>,<word>,<remainder>,`` is written, where
    ``remainder`` is the topic with the word removed and dashes tidied.

    Args:
        file_name: Path of the rule file to (over)write.
        threshold: Number of most frequent topics that are never rewritten.

    Returns:
        ``(file_name, number of rules written)``.
    """
    freq = {}
    for row in df.topics.map(lambda x: x.split(',')):
        for name in row:
            freq[name] = freq.get(name, 0) + 1

    ranked = sorted(freq.items(), key=lambda pair: pair[1], reverse=True)
    candidates = [name for name, _count in ranked[threshold:]]

    written = 0
    with open(file_name, "w") as out:
        for top_topic in selected_topics.split():
            for topic in candidates:
                if topic == top_topic or top_topic not in topic:
                    continue
                remainder = topic.replace(top_topic,'').replace('--','-').strip('-')
                out.write(f"{freq[topic]},{topic},{top_topic},{remainder},\n")
                written += 1
    return file_name, written

In [None]:
# Generate the selected-topic rule file with the default threshold.
remove_topics_contains_selected_topics(f'{rules_directory}/contains_selected_topics.csv')

## Stopwords topic

In [None]:
def remove_stopwords_topic(file_name):
    """Write a delete (``-1``) rule for every topic that is a stopword.

    Topics are taken from the comma-separated ``topics`` column of the global
    ``df`` and compared with the global ``stops`` list.  Each rule line is
    ``<frequency>,<topic>,-1,``; lines are written in set-iteration order.

    Args:
        file_name: Path of the rule file to (over)write.

    Returns:
        ``(file_name, number of rules written)``.
    """
    flat = [name for row in df.topics.map(lambda x: x.split(',')) for name in row]
    freq = {}
    for name in flat:
        freq[name] = freq.get(name, 0) + 1

    stopword_topics = set(flat) & set(stops)

    written = 0
    with open(file_name, "w") as out:
        for word in stopword_topics:
            out.write(f'{freq[word]},{word},-1,\n')
            written += 1
    return file_name, written
            

In [None]:
# Generate the stopword rule file.
remove_stopwords_topic(f'{rules_directory}/remove_stopwords_topic.csv')

## Stems topic

In [None]:
def remove_stemmed_topic(file_name):
    """Write rules mapping a topic to its stem when the stem is also a topic.

    Topics are taken from the comma-separated ``topics`` column of the global
    ``df`` and stemmed with the global ``ps``.  Each rule line is
    ``<topic frequency>,<topic>,<stem>,<stem frequency>,`` and lines are
    written in set-iteration order.

    Args:
        file_name: Path of the rule file to (over)write.

    Returns:
        ``(file_name, number of rules written)``.
    """
    flat = [name for row in df.topics.map(lambda x: x.split(',')) for name in row]
    freq = {}
    for name in flat:
        freq[name] = freq.get(name, 0) + 1

    written = 0
    with open(file_name, "w") as out:
        for topic in set(flat):
            stem = ps.stem(topic)
            if stem not in freq or topic == stem:
                continue
            out.write(f'{freq[topic]},{topic},{stem},{freq[stem]},\n')
            written += 1
    return file_name, written
            

In [None]:
# Generate the stemming rule file.
remove_stemmed_topic(f'{rules_directory}/remove_stemmed_topic.csv')

## Lemmatize

In [None]:
def remove_lemmatize_topic(file_name):
    """Write rules mapping every topic to its lemma when the two differ.

    Topics are taken from the comma-separated ``topics`` column of the global
    ``df`` and lemmatized with the global ``lemmatizer``.  Each rule line is
    ``<topic frequency>,<topic>,<lemma>,<lemma frequency>,``; the last field
    is the text ``None`` when the lemma is not itself a topic.  Lines are
    written in set-iteration order.

    NOTE(review): ``apply_csv`` treats every field after the topic as a
    replacement, so the trailing frequency column would be applied as a topic
    unless it is removed from the file first -- confirm.

    Args:
        file_name: Path of the rule file to (over)write.

    Returns:
        ``(file_name, number of rules written)``.
    """
    flat = [name for row in df.topics.map(lambda x: x.split(',')) for name in row]
    freq = {}
    for name in flat:
        freq[name] = freq.get(name, 0) + 1

    written = 0
    with open(file_name, "w") as out:
        for topic in set(flat):
            lemma = lemmatizer.lemmatize(topic)
            if lemma == topic:
                continue
            out.write(f'{freq[topic]},{topic},{lemma},{freq.get(lemma)},\n')
            written += 1
    return file_name, written

In [None]:
# Generate the lemmatization rule file.
remove_lemmatize_topic(f'{rules_directory}/remove_lemmatize_topic.csv')