In [1]:
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from langdetect import DetectorFactory, detect
from scipy import stats
from sklearn.model_selection import train_test_split

# Single seed reused for every stochastic step in the notebook.
seed = 1349

# sns.color_palette() only *returns* a palette, so calling it as a bare statement
# has no effect; set_palette() is what installs it as the default colour cycle.
sns.set_palette("tab10")

# langdetect samples randomly while classifying, so the same text can come back with
# different languages between runs. Seeding the factory before the first detect()
# call makes the English filter reproducible.
DetectorFactory.seed = seed



In [2]:
def detect_language(text):
    '''
    Detects the natural language a piece of text is written in
    INPUT:
    text = string to classify (e.g. the contents of a README)
    OUTPUT:
    language code returned by langdetect's detect (e.g. 'en'), or None when text is not a
    string, is blank, or detection fails
    '''
    # Non-string values (such as the NaN pandas uses for a missing cell) and blank
    # strings carry nothing to classify, so answer None without calling the detector.
    if not isinstance(text, str) or not text.strip():
        return None
    try:
        return detect(text)
    except Exception:
        # Best effort: text the detector cannot classify is reported as None.
        # Catching Exception rather than using a bare except lets KeyboardInterrupt
        # and SystemExit propagate, so a long-running .apply can still be stopped.
        return None

In [3]:
def acquire(r_path='langr_raw.csv', python_path='langp_raw.csv'):
    '''
    Obtains the vanilla version of both the R and Python README dataframes
    INPUT:
    r_path = path of the CSV holding the raw R data (defaults to 'langr_raw.csv')
    python_path = path of the CSV holding the raw Python data (defaults to 'langp_raw.csv')
    OUTPUT:
    r = pandas dataframe read from r_path
    python = pandas dataframe read from python_path
    '''
    r = pd.read_csv(r_path)
    python = pd.read_csv(python_path)
    return r, python


In [4]:
# Load the two raw dataframes returned by acquire().
r, python = acquire()

In [5]:
# Display the first dataframe returned by acquire() for a visual check.
r

Unnamed: 0.1,Unnamed: 0,repo,language,readme_contents
0,0,ujjwalkarn/DataScienceR,R,# R Data Science Tutorials\n- This repo contai...
1,1,IndrajeetPatil/ggstatsplot,R,---\noutput: github_document\n---\n\n <!-- RE...
2,2,easystats/easystats,R,"---\noutput: github_document\n---\n\n```{r, wa..."
3,3,seandavi/sars2pack,R,---\noutput: \n - rmarkdown::github_document\...
4,4,markvanderloo/lumberjack,R,## Track changes in data\n[![Build Status](htt...
...,...,...,...,...
170,170,leenock/ML,R,��#
171,171,Phelipe-Sempreboni/data-science,R,# Data Science\n\n---\n\n### Esse repositório ...
172,172,fernandapilat/r-para-data-science,R,# R para Data Science\n\nRepositório criado pa...
173,173,lcasilva/Projeto_DSA_Feedback2,R,# Projeto_Feedback2\nProjeto Feedback 2: Machi...


In [6]:
# Display the second dataframe returned by acquire() for a visual check.
python

Unnamed: 0.1,Unnamed: 0,repo,language,readme_contents
0,0,ludwig-ai/ludwig,Python,![Ludwig logo](https://github.com/ludwig-ai/lu...
1,1,modin-project/modin,Python,"<p align=""center""><a href=""https://modin.readt..."
2,2,Netflix/metaflow,Python,![Metaflow_Logo_Horizontal_FullColor_Ribbon_Da...
3,3,lk-geimfari/mimesis,Python,Mimesis: The Fake Data Generator\n------------...
4,4,holoviz/panel,Python,"<a href=""https://panel.holoviz.org/"">\n <pict..."
...,...,...,...,...
175,175,rahulbordoloi/Machine-Learning,Python,# Machine-Learning\nContains my Glossary for M...
176,176,Oslandia/QDeeplandia,Python,QGIS plugin dedicated to 2D semantic segmentat...
177,177,arjunbhasin2013/BaselinePredictionMLFramework,Python,# Baseline Prediction Machine Learning Framewo...
178,178,PonderaLab/datascience4economists,Python,"# Datascience4economists\n\nFirst of all, welc..."


In [14]:
def prepare_mvp():
    '''
    Combines the raw R and Python README dataframes and returns a cleaned version that is ready
    for exploration and further analysis
    INPUT:
    NONE
    OUTPUT:
    df = pandas dataframe holding the rows of both raw dataframes, minus the leftover
    'Unnamed: 0' column, duplicate rows and rows whose readme_contents is not detected as 'en'
    '''
    r, python = acquire()
    df = pd.concat([r, python], ignore_index=True)
    # 'Unnamed: 0' is presumably an index column saved into the raw CSVs (TODO confirm);
    # errors='ignore' keeps this step working if the files are regenerated without it.
    df = df.drop(columns='Unnamed: 0', errors='ignore')
    df = df.drop_duplicates()
    # Build the language filter as a standalone boolean mask instead of adding and then
    # dropping a helper column; this also avoids assigning into a filtered frame.
    is_english = df['readme_contents'].apply(detect_language) == 'en'
    # The row labels from the concatenated frame are kept as-is (not reset) after filtering.
    return df[is_english]

In [15]:
# Build the prepared dataframe; prepare_mvp() runs detect_language on every README.
df = prepare_mvp()

In [16]:
# Display the prepared dataframe for a visual check.
df

Unnamed: 0,repo,language,readme_contents
0,ujjwalkarn/DataScienceR,R,# R Data Science Tutorials\n- This repo contai...
1,IndrajeetPatil/ggstatsplot,R,---\noutput: github_document\n---\n\n <!-- RE...
2,easystats/easystats,R,"---\noutput: github_document\n---\n\n```{r, wa..."
3,seandavi/sars2pack,R,---\noutput: \n - rmarkdown::github_document\...
4,markvanderloo/lumberjack,R,## Track changes in data\n[![Build Status](htt...
...,...,...,...
350,rahulbordoloi/Machine-Learning,Python,# Machine-Learning\nContains my Glossary for M...
351,Oslandia/QDeeplandia,Python,QGIS plugin dedicated to 2D semantic segmentat...
352,arjunbhasin2013/BaselinePredictionMLFramework,Python,# Baseline Prediction Machine Learning Framewo...
353,PonderaLab/datascience4economists,Python,"# Datascience4economists\n\nFirst of all, welc..."


In [17]:
# Write the prepared dataframe to language_data.csv; index=False leaves the row index out of the file.
df.to_csv('language_data.csv', index=False)

In [18]:
# Display the prepared dataframe again after it has been written to disk.
df

Unnamed: 0,repo,language,readme_contents
0,ujjwalkarn/DataScienceR,R,# R Data Science Tutorials\n- This repo contai...
1,IndrajeetPatil/ggstatsplot,R,---\noutput: github_document\n---\n\n <!-- RE...
2,easystats/easystats,R,"---\noutput: github_document\n---\n\n```{r, wa..."
3,seandavi/sars2pack,R,---\noutput: \n - rmarkdown::github_document\...
4,markvanderloo/lumberjack,R,## Track changes in data\n[![Build Status](htt...
...,...,...,...
350,rahulbordoloi/Machine-Learning,Python,# Machine-Learning\nContains my Glossary for M...
351,Oslandia/QDeeplandia,Python,QGIS plugin dedicated to 2D semantic segmentat...
352,arjunbhasin2013/BaselinePredictionMLFramework,Python,# Baseline Prediction Machine Learning Framewo...
353,PonderaLab/datascience4economists,Python,"# Datascience4economists\n\nFirst of all, welc..."
