In [1]:
# Prepare data: load the scraped READMEs, clean the text, and detect each README's written language.

In [2]:
import os
import json
from typing import Dict, List, Optional, Union, cast
import requests
import pandas as pd
import numpy as np
import bs4
import time

from sklearn.feature_extraction.text import TfidfVectorizer
import nltk
import unicodedata
import re

import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, accuracy_score

from env import github_token, github_username

import acquire

In [3]:
# Load the README dataset from the CSV in the working directory.
df = pd.read_csv(filepath_or_buffer=r'f1_readmes.csv')

In [4]:
# Preview the first rows to confirm the file loaded with the expected columns.
df.head()

Unnamed: 0,repo,language,readme_contents
0,ppatierno/formula1-telemetry-kafka,Java,# Formula 1 - Telemetry with Apache Kafka\n\nT...
1,NVIDIA-AI-IOT/Formula1Epoch,Makefile,[![logo.png](https://s2.postimg.org/fwiu26nmh/...
2,jcnewell/ergast-f1-api,PHP,# Ergast nodeJS API\nIn this period we are wor...
3,SOYJUN/FTP-implement-based-on-UDP,C,Copyright (c) 2014 Jun Zeng. <jun.zeng@stonybr...
4,daz/live-f1,C,live-f1 is a native Linux client for viewing t...


In [5]:
# Extra tokens removed on top of NLTK's English stopword list.
# NOTE: '\n' can never match a token because clean() tokenizes with
# str.split(), which already discards all whitespace; it is kept only so the
# constant's contents stay unchanged for any other users.
ADDITIONAL_STOPWORDS = ['r', 'u', '2', 'ltgt', '\n', 'ha']


def clean(readme_contents):
    """Normalize raw README text into a space-separated string of lemmas.

    Steps, in order:
      1. Unicode NFKD normalization, then drop every non-ASCII character.
      2. Lowercase.
      3. Delete every character that is neither a word character nor
         whitespace. Punctuation is *deleted*, not replaced by a space, so
         "live-f1" becomes "livef1" and URLs/e-mails collapse into one token.
      4. Split on whitespace.
      5. Drop tokens found in NLTK's English stopwords or ADDITIONAL_STOPWORDS.
      6. Lemmatize the surviving tokens with NLTK's WordNetLemmatizer.

    Parameters
    ----------
    readme_contents : str, None, or float NaN
        Raw README text. None and NaN (how pandas represents missing text)
        are treated as empty input.

    Returns
    -------
    str
        The cleaned tokens joined by single spaces; '' when nothing survives
        or the input is missing.

    Raises
    ------
    TypeError
        If ``readme_contents`` is any other non-string value (raised by
        ``unicodedata.normalize``).
    """
    # Missing values would otherwise crash inside unicodedata.normalize.
    # NaN is the only float that is not equal to itself.
    if readme_contents is None or (
        isinstance(readme_contents, float) and readme_contents != readme_contents
    ):
        return ''

    wnl = nltk.stem.WordNetLemmatizer()
    # A set makes the per-token membership test O(1) instead of a list scan.
    stopwords = set(nltk.corpus.stopwords.words('english')) | set(ADDITIONAL_STOPWORDS)

    ascii_text = (
        unicodedata.normalize('NFKD', readme_contents)
        .encode('ascii', 'ignore')
        .decode('utf-8', 'ignore')
        .lower()
    )
    words = re.sub(r'[^\w\s]', '', ascii_text).split()
    # Stopwords are filtered on the raw token, before lemmatization.
    return " ".join(wnl.lemmatize(word) for word in words if word not in stopwords)

In [6]:
# Index labels of the rows whose README text is missing.
null_readme = df.loc[df['readme_contents'].isna()].index

In [7]:
# Check what kind of object holds the null-row labels (a pandas Index, not a list).
type(null_readme)

pandas.core.indexes.numeric.Int64Index

In [8]:
# Remove rows with no README text. dropna is idempotent, so this cell can be
# re-run safely; dropping a precomputed label list in place raises KeyError on
# a second run because those labels are already gone. Index labels of the
# surviving rows are preserved.
df = df.dropna(subset=['readme_contents'])

In [9]:
# Confirm no null readme_contents remain; `language` may still contain nulls.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 268 entries, 0 to 325
Data columns (total 3 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   repo             268 non-null    object
 1   language         241 non-null    object
 2   readme_contents  268 non-null    object
dtypes: object(3)
memory usage: 8.4+ KB


In [10]:
# Overwrite each raw README with its cleaned, lemmatized text.
df['readme_contents'] = df['readme_contents'].apply(clean)

df.head()

Unnamed: 0,repo,language,readme_contents
0,ppatierno/formula1-telemetry-kafka,Java,formula 1 telemetry apache kafka project aim u...
1,NVIDIA-AI-IOT/Formula1Epoch,Makefile,logopnghttpss2postimgorgfwiu26nmhcopy_of_elect...
2,jcnewell/ergast-f1-api,PHP,ergast nodejs api period working improve serve...
3,SOYJUN/FTP-implement-based-on-UDP,C,copyright c 2014 jun zeng junzengstonybrookedu...
4,daz/live-f1,C,livef1 native linux client viewing official fo...


In [11]:
from textblob import TextBlob

In [18]:
# Wrap one cleaned README (index label 2) in a TextBlob as a trial run.
text = TextBlob(df.loc[2, 'readme_contents'])

In [21]:
# Detect the written language of the sample README.
# NOTE(review): TextBlob's detect_language() is believed to call an external
# translation service and to be deprecated in newer TextBlob releases — confirm
# before relying on it for a reproducible run.
text.detect_language()

'en'

In [22]:
# Show the full cleaned text of the README at index label 3.
df.loc[3, 'readme_contents']

'copyright c 2014 jun zeng junzengstonybrookedu zengjun0916gmailcom copyright c 2014 yigong wang yigwangcsstonybrookedu right reserved based steven code library unix network programming volume 1 third edition unrestricted right granted instructor student cse530 class fall 2014 stony brook university'

In [25]:
# Tag every cleaned README with its detected language code
# (one detect_language() call per row).
df['written_language'] = [
    TextBlob(content).detect_language() for content in df['readme_contents']
]

In [26]:
# Preview the frame with the new written_language column.
df.head()

Unnamed: 0,repo,language,readme_contents,written_language
0,ppatierno/formula1-telemetry-kafka,Java,formula 1 telemetry apache kafka project aim u...,en
1,NVIDIA-AI-IOT/Formula1Epoch,Makefile,logopnghttpss2postimgorgfwiu26nmhcopy_of_elect...,en
2,jcnewell/ergast-f1-api,PHP,ergast nodejs api period working improve serve...,en
3,SOYJUN/FTP-implement-based-on-UDP,C,copyright c 2014 jun zeng junzengstonybrookedu...,en
4,daz/live-f1,C,livef1 native linux client viewing official fo...,en


In [27]:
# Count READMEs per detected written language.
df['written_language'].value_counts()

en    225
pt     12
es      8
fr      6
gl      3
de      3
da      2
ro      2
vi      1
nl      1
pl      1
id      1
mg      1
rw      1
sl      1
Name: written_language, dtype: int64

### Takeaways

- Several READMEs are not written in English; this may need to be accounted for during modeling.
- The 43 non-English entries (268 total minus the 225 detected as `en`) could be translated into English using `googletrans`.