# Project Task 1
## Downloading Python Repositories

Extra credit: download the following Python repositories automatically, using Python.

1. matplotlib: A plotting library

2. scikit-learn: A machine learning library

3. numpy: A scientific computing library

4. pandas: A columnar data analysis library

5. django: A web framework

6. scipy: A scientific computing library

7. flask: A micro web framework

8. requests: An HTTP requests library

9. ansible: An IT automation platform

10. sentry: A crash reporting utility

11. scrapy: A web scraper

12. Mailpile: An email client

13. sshuttle: A proxy server

14. salt: An IT automation platform

15. NewsBlur: A newsreader

16. beets: A music library manager

In [1]:
import os
import subprocess

# Directory that holds every cloned repository.
REPO_DIR = 'Python Repositories'

# exist_ok=True makes the creation idempotent without a separate (race-prone)
# existence check.
os.makedirs(REPO_DIR, exist_ok=True)

# Repositories whose GitHub owner and project share one name, so the clone URL
# is https://github.com/<name>/<name>.git
repos = ['matplotlib', 'scikit-learn', 'numpy', 'django', 'scipy', 'ansible', 'scrapy', 'Mailpile', 'sshuttle']

# Repositories whose owner differs from the project name (full URLs).
# NewsBlur is part of the project's repository list and its files are read by
# the later AST cell, so it is cloned here together with the others.
additional_repos = [
    'https://github.com/pandas-dev/pandas',
    'https://github.com/pallets/flask',
    'https://github.com/psf/requests',
    'https://github.com/getsentry/sentry',
    'https://github.com/saltstack/salt',
    'https://github.com/beetbox/beets',
    'https://github.com/samuelclay/NewsBlur',
]

# Normalise both lists into (clone URL, local directory name) pairs so that a
# single loop handles every repository the same way.
clone_targets = [(f'https://github.com/{repo}/{repo}.git', repo) for repo in repos]
clone_targets += [(url, url.split('/')[-1]) for url in additional_repos]

for url, name in clone_targets:
    destination = f'{REPO_DIR}/{name}'

    # Skip repositories that were already downloaded by an earlier run.
    if os.path.exists(destination):
        print(f'{destination} already exists')
        continue

    # A list argument (no shell) keeps the space in the directory name intact.
    completed = subprocess.run(['git', 'clone', url, destination])

    # Report a failed clone instead of ignoring it, but keep going so one bad
    # repository does not block the remaining downloads.
    if completed.returncode != 0:
        print(f'Failed to clone {url} (git exit code {completed.returncode})')


Python Repositories/matplotlib already exists
Python Repositories/scikit-learn already exists
Python Repositories/numpy already exists
Python Repositories/django already exists
Python Repositories/scipy already exists
Python Repositories/ansible already exists
Python Repositories/scrapy already exists
Python Repositories/Mailpile already exists
Python Repositories/sshuttle already exists
Python Repositories/pandas already exists
Python Repositories/flask already exists
Python Repositories/requests already exists
Python Repositories/sentry already exists
Python Repositories/salt already exists
Python Repositories/beets already exists


In [2]:
# Use the project-local python_crawler module to walk the cloned repositories
# and locate their Python source files.
import python_crawler

# Root directory that holds the cloned repositories
path = 'Python Repositories'

# Build a PythonCrawler rooted at that directory
crawler = python_crawler.PythonCrawler(path)

# Crawl through the repositories (the return value is not used in this cell)
crawler.crawl()

# Print the paths of all the .py files
crawler.print_py_files()

# Aggregate all the .py files into a single file
# NOTE(review): presumably this writes 'aggregate_py_files.py', which a later
# cell reads -- confirm against python_crawler.py
crawler.aggregate_py_files()

Python Repositories/flask/tests/test_basic.py
Python Repositories/flask/tests/conftest.py
Python Repositories/flask/tests/test_converters.py
Python Repositories/flask/tests/test_logging.py
Python Repositories/flask/tests/test_signals.py
Python Repositories/flask/tests/test_async.py
Python Repositories/flask/tests/test_session_interface.py
Python Repositories/flask/tests/test_instance_config.py
Python Repositories/flask/tests/test_views.py
Python Repositories/flask/tests/test_json_tag.py
Python Repositories/flask/tests/test_subclassing.py
Python Repositories/flask/tests/test_reqctx.py
Python Repositories/flask/tests/test_blueprints.py
Python Repositories/flask/tests/test_config.py
Python Repositories/flask/tests/test_user_error_handler.py
Python Repositories/flask/tests/test_helpers.py
Python Repositories/flask/tests/test_json.py
Python Repositories/flask/tests/test_cli.py
Python Repositories/flask/tests/test_templating.py
Python Repositories/flask/tests/test_appctx.py
Python Repositori

In [3]:
# Importing the required libraries
import pandas as pd
import numpy as np
import gensim
import os
import string
from nltk.tokenize import RegexpTokenizer
from nltk import sent_tokenize
from gensim.utils import simple_preprocess

# Read the aggregated source file. An explicit encoding keeps the read
# independent of the platform's default text encoding.
with open('aggregate_py_files.py', 'r', encoding='utf-8') as f:
    corpus = f.read()

# Tokenized corpus: one list of tokens per NLTK "sentence". sent_tokenize
# splits on sentence boundaries, not on physical source lines.
# `code` must remain a list (not a generator) because it is iterated again
# when the Word2Vec vocabulary is built and when the model is trained.
code = [simple_preprocess(sent) for sent in sent_tokenize(corpus)]

# Printing the number of lines of code and the number of tokens (words) in the file
# NOTE: the first figure is the number of NLTK sentences, not physical lines;
# the label is kept unchanged so the output stays comparable with earlier runs.
print(f'Number of lines of code: {len(code)}')
# Sum the per-sentence lengths instead of materialising one flat list that
# holds every token just to measure its length.
print(f'Number of tokens (words): {sum(len(sent) for sent in code)}')

Number of lines of code: 308317
Number of tokens (words): 14366189


In [4]:
# Set up an untrained Gensim Word2Vec model; the tokenized code is supplied
# afterwards, when the vocabulary is built and the model is trained.
#   min_count=2 -> tokens seen fewer than two times are ignored
#   window=10   -> context spans up to ten tokens around the current token
model = gensim.models.Word2Vec(min_count=2, window=10)

In [5]:
# Build the model's vocabulary from the tokenized corpus before training.
model.build_vocab(code)

In [6]:
# Train on the same tokenized corpus, passing the example count and the number
# of epochs that are stored on the model itself.
model.train(code, total_examples=model.corpus_count, epochs=model.epochs)

(60864404, 71830945)

In [7]:
# File the trained model is persisted to.
model_file = 'python2vec.model'

# Remove a model file left over from an earlier run, if there is one ...
if os.path.exists(model_file):
    os.remove(model_file)

# ... and write the freshly trained model in its place.
model.save(model_file)

In [8]:
# Inspect the trained embeddings: first the closest words to the Python
# keywords "for" and "if", then the similarity between the popular identifier
# names "math" and "numpy".
word_vectors = model.wv

# Closest words to each keyword, printed in the order listed
for keyword in ('for', 'if'):
    print(word_vectors.most_similar(keyword))

# Similarity of "math" and "numpy"
print(word_vectors.similarity('math', 'numpy'))


[('addition', 0.5029162168502808), ('subtract_only', 0.49672356247901917), ('other_tups', 0.47950538992881775), ('rs_res', 0.47513994574546814), ('case', 0.461989164352417), ('expires__', 0.4606761336326599), ('status_queries', 0.45825105905532837), ('use_item', 0.45625022053718567), ('argiter', 0.4531697630882263), ('admissions', 0.4510370194911957)]
[('elif', 0.694455623626709), ('expected_parsed', 0.5941764116287231), ('and', 0.5803670883178711), ('else', 0.5770476460456848), ('not', 0.5373891592025757), ('continue', 0.5011794567108154), ('or', 0.4985090494155884), ('break', 0.4727449119091034), ('sindexers', 0.4507202208042145), ('sleep_on_button', 0.4483819305896759)]
0.14585522


# Extension

### Frequency of Identifier Names from an Abstract Syntax Tree (AST)

In [9]:
# Using the python_crawler.py file to crawl through the repositories looking for Python files
import ast

import python_crawler

# Declaring the path of the 'Python Repositories' directory
path = 'Python Repositories'

# Creating an object of the PythonCrawler class
crawler = python_crawler.PythonCrawler(path)

# Crawl through the repositories; the returned file paths are parsed below
files = crawler.crawl()

# Files that are always left out. They are matched by path suffix so the
# exclusion works no matter where the project is checked out or whether the
# crawler reports relative or absolute paths.
excluded_suffixes = (
    'NewsBlur/flask_monitor/flask_settings.py',
    'NewsBlur/vendor/cjson/jsontest.py',
)

# Every ast.Name identifier found, in encounter order (duplicates are kept so
# that frequencies can be counted afterwards).
identifiers = []

for file in files:

    # Normalise separators before comparing against the suffixes above.
    if str(file).replace('\\', '/').endswith(excluded_suffixes):
        continue

    print(f'Processing {file}...')

    try:
        with open(file, 'r', encoding='utf-8') as f:
            # Named `source` rather than `code` so the tokenized corpus held
            # in the global `code` is not overwritten by this cell.
            source = f.read()

        # Fixing the inconsistent use of tabs and spaces in the code
        source = source.replace('\t', '    ')

        # get the abstract syntax tree of the file
        ast_tree = ast.parse(source)
    except (SyntaxError, ValueError) as err:
        # SyntaxError: the file is not valid for this Python version.
        # ValueError: undecodable bytes (UnicodeDecodeError is a subclass) or
        # source text that ast.parse rejects outright.
        # Skip the file instead of aborting the whole run on one bad input.
        print(f'Skipping {file}: {err}')
        continue

    # get a list of all identifiers in the ast tree
    identifiers.extend(
        node.id for node in ast.walk(ast_tree) if isinstance(node, ast.Name)
    )

Processing Python Repositories/flask/tests/test_basic.py...
Processing Python Repositories/flask/tests/conftest.py...
Processing Python Repositories/flask/tests/test_converters.py...
Processing Python Repositories/flask/tests/test_logging.py...
Processing Python Repositories/flask/tests/test_signals.py...
Processing Python Repositories/flask/tests/test_async.py...
Processing Python Repositories/flask/tests/test_session_interface.py...
Processing Python Repositories/flask/tests/test_instance_config.py...
Processing Python Repositories/flask/tests/test_views.py...
Processing Python Repositories/flask/tests/test_json_tag.py...
Processing Python Repositories/flask/tests/test_subclassing.py...
Processing Python Repositories/flask/tests/test_reqctx.py...
Processing Python Repositories/flask/tests/test_blueprints.py...
Processing Python Repositories/flask/tests/test_config.py...
Processing Python Repositories/flask/tests/test_user_error_handler.py...
Processing Python Repositories/flask/tests

In [11]:
# Find the 50 most common identifiers in Python from the frequencies you calculated in the previous step.
from collections import Counter
from itertools import combinations

counter = Counter(identifiers)

# Compute the ranking once and reuse it; recomputing it for every pair would
# repeat the same selection work thousands of times. Fewer than 50 entries are
# returned when fewer than 50 distinct identifiers exist.
top_identifiers = counter.most_common(50)

# Print the 50 most common identifiers in Python
print(top_identifiers)

# Using the Python2Vec model you built previously, calculate the similarity between each pair of the 50 most common identifiers and highlight interesting patterns in your report.

# Load the Python2Vec model
model = gensim.models.Word2Vec.load('python2vec.model')

# Keep only the identifiers the model has a vector for; the ranking order is
# preserved, so the pairs are printed in the same order as a nested i < j loop.
known_names = [name for name, _ in top_identifiers if name in model.wv]

# Print the similarity between each pair of the 50 most common identifiers
for first, second in combinations(known_names, 2):
    print(f'Similarity between {first} and {second}: {model.wv.similarity(first, second)}')
        

# Enhance the way you flag identifiers as similar/dissimilar until you are satisfied with the results.




[('self', 354702), ('np', 117170), ('ret', 54834), ('result', 50163), ('x', 40059), ('pytest', 35692), ('expected', 33334), ('name', 32680), ('data', 29716), ('str', 27421), ('response', 22811), ('a', 22731), ('df', 22249), ('len', 21876), ('y', 20949), ('X', 20894), ('tm', 19717), ('os', 19119), ('kwargs', 18669), ('i', 18370), ('patch', 17948), ('ValueError', 17555), ('isinstance', 16572), ('key', 16512), ('msg', 16289), ('salt', 16239), ('value', 15190), ('res', 14970), ('ax', 13820), ('list', 13356), ('path', 12990), ('MagicMock', 12730), ('assert_equal', 12573), ('int', 12489), ('args', 12326), ('n', 12008), ('b', 11999), ('models', 11830), ('log', 11819), ('f', 11662), ('s', 11614), ('request', 11488), ('k', 10679), ('DataFrame', 10433), ('c', 10369), ('datetime', 9982), ('p', 9667), ('obj', 9558), ('pd', 9312), ('range', 9309)]
Similarity between self and np: 0.03792950510978699
Similarity between self and ret: 0.13570013642311096
Similarity between self and result: 0.0753111839