# pymongo基本操作

### 创建数据库

In [None]:
import pymongo

# Create a client for the local server. Nothing is authenticated and no
# database is selected yet -- roughly the same as opening a mongo shell.
client = pymongo.MongoClient(host='127.0.0.1', port=27017)
# Get a handle to the "testdb" database.
db = client['testdb']

# Create an administrator. The built-in "root" role is defined in the "admin"
# database, so it has to be referenced there explicitly; a bare "root" would
# be looked up in "testdb".
db.command("createUser", "admin", pwd="admin@123456.",
           roles=[{"role": "root", "db": "admin"}])
# Create a regular user with read/write access to this database.
db.command("createUser", "arvin", pwd="arvin@123456.", roles=["readWrite"])
# Change the password only after the user exists; issuing updateUser for a
# user that has not been created yet fails.
db.command("updateUser", "arvin", pwd="arvin123")

### 检测mongodb服务是否可用

In [42]:
from pymongo import MongoClient
from pymongo.errors import ConnectionFailure

client = MongoClient()

try:
    # "ping" is cheap and does not require auth. It is used here instead of
    # the deprecated "ismaster" command.
    client.admin.command('ping')
except ConnectionFailure:
    print("Server is not available")
else:
    # Only reached when the command round-trip succeeded.
    print('Server is available')

Server is available


### 连接mongodb(无auth验证)

无验证式连接数据库通常是在第一次连接数据库时使用到,此时没有建立数据库,没有配置用户角色和读写权限.

In [26]:
import pymongo

# Create a client for the local server. Nothing is authenticated and no
# database is selected yet -- roughly the same as opening a mongo shell.
client = pymongo.MongoClient(host='127.0.0.1', port=27017)
# Get a handle to the "testdb" database.
db = client['testdb']

# Create an administrator. The built-in "root" role is defined in the "admin"
# database, so it has to be referenced there explicitly; a bare "root" would
# be looked up in "testdb".
db.command("createUser", "admin", pwd="admin@123456.",
           roles=[{"role": "root", "db": "admin"}])
# Create a regular user with read/write access.
db.command("createUser", "arvin", pwd="arvin@123456.", roles=["readWrite"])
# Change the password only after the user exists; issuing updateUser for a
# user that has not been created yet fails.
db.command("updateUser", "arvin", pwd="arvin123")
# Create a regular user with read-only access.
db.command("createUser", "xiaoming", pwd="xiaoming@123456.", roles=["read"])
# MongoDB has no built-in write-only role ("write" does not exist), so define
# a custom role that grants only the write actions on this database.
db.command(
    "createRole", "writeOnly",
    privileges=[{
        "resource": {"db": "testdb", "collection": ""},
        "actions": ["insert", "update", "remove"],
    }],
    roles=[],
)
# Create a regular user that may only write.
db.command("createUser", "xiaohong", pwd="xiaohong@123456.", roles=["writeOnly"])
# Change a user's password.
db.command("updateUser", "xiaoming", pwd="xiaoming@123.")
# Change a user's roles (this replaces the previous role list).
db.command("updateUser", "xiaoming", roles=["readWrite"])

{'ok': 1.0}

### mongodb连接(带有auth验证)

pymongo是线程安全的,但是不是fork-safe. 当多进程使用同一个client实例会报错:

**解决办法:** 在连接mongo数据库时,设置关键字`connect=False`,即在mongodb实例化时不建立连接,等到有数据库操作才进行数据库连接.

In [None]:
import pymongo

try:
    # Python 3.x
    from urllib.parse import quote_plus
except ImportError:
    # Python 2.x
    from urllib import quote_plus

# Connection settings consumed by the connection helpers in this cell.
# NOTE: this password contains '@', which has to be percent-encoded before it
# is embedded in a mongodb:// URI.
MONGODB = {
    "user": "arvin",
    "passwd": "arvin@123456.",
    "host": "127.0.0.1:27017",
    "dbname": "testdb"
}

# 方式一
# 以类的方式创建mongodb的实例
class ConnectMongo(object):
    """Own an authenticated MongoClient plus a handle to one database.

    ``MONGODB`` is a mapping with the keys ``user``, ``passwd``, ``host``
    (``host:port``) and ``dbname``. The credentials are placed into the URI
    verbatim, so they must not contain characters that need percent-encoding
    (':', '/', '+', '@').

    Attributes:
        client: the underlying ``pymongo.MongoClient``.
        mongodb: the database named by ``MONGODB['dbname']``.
    """

    def __init__(self, MONGODB):
        # Pre-set so __del__ stays safe if building the client raises.
        self.client = None
        self.client = pymongo.MongoClient(
            'mongodb://{}:{}@{}/{}'.format(MONGODB['user'],
                                           MONGODB['passwd'],
                                           MONGODB['host'],
                                           MONGODB['dbname']),
            # Do not connect at construction time; the connection is opened
            # on the first database operation instead.
            connect=False
        )
        self.mongodb = self.client[MONGODB['dbname']]

    def __del__(self):
        # Close the client when this wrapper is garbage collected. The
        # attribute may be missing or None if __init__ never completed.
        client = getattr(self, 'client', None)
        if client is not None:
            client.close()
        
# 方式二        
def connect_mongo(MONGODB):
    """Return a handle to the database described by ``MONGODB``.

    ``MONGODB`` is a mapping with the keys ``user``, ``passwd``, ``host``
    (``host:port``) and ``dbname``. The credentials are placed into the URI
    verbatim, so they must not contain characters that need percent-encoding.

    The client created here is not closed by this function and only the
    database handle is returned, so the client stays open until the caller
    closes it.
    """
    uri = 'mongodb://{}:{}@{}/{}'.format(MONGODB['user'],
                                        MONGODB['passwd'],
                                        MONGODB['host'],
                                        MONGODB['dbname'])
    # connect=False: the connection is opened on the first operation.
    client = pymongo.MongoClient(uri, connect=False)
    return client[MONGODB['dbname']]

# 方式三: 处理特殊字符 
def connect_mongo(mongo_param):
    """Return a database handle, percent-encoding the credentials first.

    ``mongo_param`` is a mapping with the keys ``user``, ``passwd``, ``host``
    and ``dbname``. If creating the client raises, the exception is printed
    and ``None`` is returned.
    """
    # Escape the credentials so characters such as '@' or ':' are URI-safe.
    user = quote_plus(mongo_param['user'])
    password = quote_plus(mongo_param['passwd'])
    uri = 'mongodb://{}:{}@{}/{}'.format(
        user, password, mongo_param['host'], mongo_param['dbname'])
    try:
        # connect=False: the connection is opened on the first operation.
        return pymongo.MongoClient(uri, connect=False)[mongo_param['dbname']]
    except Exception as e:
        print(e)

**注意:** 
1. 方式二存在内存泄漏问题,因为创建的client没有在程序结束后close掉.
2. 当用户名或密码带有 ‘:’, ‘/’, ‘+’ 和 ‘@’ 这些字符时需要使用百分比编码,否则连接报错: `Username and password must be escaped according to RFC 3986, use urllib.parse.quote_plus().`

### 插入数据(insert)

In [34]:
import random
import pymongo

# Connection settings for the local test database.
MONGODB = {
    "user": "arvin",
    "passwd": "arvin123",
    "host": "127.0.0.1:27017",
    "dbname": "testdb"
}


def connect_mongo(mongo_param):
    """Return a handle to the database named in ``mongo_param``.

    ``mongo_param`` is a mapping with the keys ``user``, ``passwd``, ``host``
    (``host:port``) and ``dbname``. The credentials are percent-encoded before
    being placed into the URI.

    Returns the database handle, or ``None`` (after printing the error) if
    the client could not be created.
    """
    # Imported locally so this cell also runs on its own in a fresh kernel.
    from urllib.parse import quote_plus

    uri = 'mongodb://{}:{}@{}/{}'.format(quote_plus(mongo_param['user']),
                                        quote_plus(mongo_param['passwd']),
                                        mongo_param['host'],
                                        mongo_param['dbname'])
    try:
        # connect=False: the connection is opened on the first operation.
        client = pymongo.MongoClient(uri, connect=False)
        return client[mongo_param['dbname']]
    except Exception as e:
        # Best effort: report the problem and signal failure with None.
        print(e)
        return None
        
db = connect_mongo(MONGODB)
movie = db['info']

# Candidate values; defined once instead of on every loop iteration.
names = ['xiaoming', 'frank', 'tom', 'jack', 'bomb']
ages = [18, 20, 19, 21, 22, 26, 24]
weights = [120, 150, 142, 123, 126, 135, 134]


def _random_person():
    # Build a fresh document each time: insert_one() adds an "_id" key to the
    # dict it is given, so re-inserting the same dict raises DuplicateKeyError.
    return {'name': random.choice(names), 'age': random.choice(ages),
            'weight': random.choice(weights), 'sex': 'male'}


for _ in range(10):
    movie.insert_one(_random_person())

# insert_one() returns an InsertOneResult; its inserted_id attribute holds the
# new document's ObjectId, e.g. ObjectId('5af8f8c62b5eba0bb8fece6d').
result = movie.insert_one(_random_person())



### 删除数据(delete)

In [33]:
import pymongo

# Connection settings for the local test database.
MONGODB = {
    "user": "arvin",
    "passwd": "arvin123",
    "host": "127.0.0.1:27017",
    "dbname": "testdb"
}


def connect_mongo(mongo_param):
    """Return a handle to the database named in ``mongo_param``.

    ``mongo_param`` is a mapping with the keys ``user``, ``passwd``, ``host``
    (``host:port``) and ``dbname``. The credentials are percent-encoded before
    being placed into the URI.

    Returns the database handle, or ``None`` (after printing the error) if
    the client could not be created.
    """
    # Imported locally so this cell also runs on its own in a fresh kernel.
    from urllib.parse import quote_plus

    uri = 'mongodb://{}:{}@{}/{}'.format(quote_plus(mongo_param['user']),
                                        quote_plus(mongo_param['passwd']),
                                        mongo_param['host'],
                                        mongo_param['dbname'])
    try:
        # connect=False: the connection is opened on the first operation.
        client = pymongo.MongoClient(uri, connect=False)
        return client[mongo_param['dbname']]
    except Exception as e:
        # Best effort: report the problem and signal failure with None.
        print(e)
        return None
        
db = connect_mongo(MONGODB)

# An empty filter matches every document, so this removes a single document
# from the collection.
db.res_score.delete_one({})

# With the same empty filter, delete_many() removes all remaining documents.
db.res_score.delete_many({})


<pymongo.results.DeleteResult at 0x200524df5c8>

### 更新数据(update)

In [41]:
import pymongo

# Connection settings for the local test database.
MONGODB = {
    "user": "arvin",
    "passwd": "arvin123",
    "host": "127.0.0.1:27017",
    "dbname": "testdb"
}


def connect_mongo(mongo_param):
    """Return a handle to the database named in ``mongo_param``.

    ``mongo_param`` is a mapping with the keys ``user``, ``passwd``, ``host``
    (``host:port``) and ``dbname``. The credentials are percent-encoded before
    being placed into the URI.

    Returns the database handle, or ``None`` (after printing the error) if
    the client could not be created.
    """
    # Imported locally so this cell also runs on its own in a fresh kernel.
    from urllib.parse import quote_plus

    uri = 'mongodb://{}:{}@{}/{}'.format(quote_plus(mongo_param['user']),
                                        quote_plus(mongo_param['passwd']),
                                        mongo_param['host'],
                                        mongo_param['dbname'])
    try:
        # connect=False: the connection is opened on the first operation.
        client = pymongo.MongoClient(uri, connect=False)
        return client[mongo_param['dbname']]
    except Exception as e:
        # Best effort: report the problem and signal failure with None.
        print(e)
        return None
        
db = connect_mongo(MONGODB)
movie = db['info']

# Partial update of one document: $set changes only the listed field.
result = movie.update_one({'name': 'jack'}, {'$set': {'weight': 132}})

print(result.matched_count)
print(result.modified_count)

# Update every document that matches the filter.
result = movie.update_many({'name': 'frank'}, {'$set': {'age': 32}})

print(result.matched_count)
print(result.modified_count)

# Replace a whole document. update_one() only accepts update operators, so a
# full replacement goes through replace_one(), which takes the new document
# (everything except _id) as its second argument.
item = movie.find_one({'name': 'jack'})
if item is not None:
    new_doc = {key: value for key, value in item.items() if key != '_id'}
    movie.replace_one({'_id': item['_id']}, new_doc)

1
0
3
3


### 查询数据

#### 查询指定字段最新一条记录

In [None]:
from pprint import pprint

# Sorting by _id descending and limiting to 1 yields the newest document.
res = movie.find({}).sort('_id', -1).limit(1)
# A cursor object is always truthy, and indexing an empty one raises, so pull
# the first item with a default instead of testing the cursor itself.
latest = next(res, None)
if latest is not None:
    pprint(latest)

**注意:** limit返回的是一个游标(迭代器),并不是数据结果

#### 查询结果排序

In [None]:
db.Account.find({}).sort("UserName")  # ascending by default
db.Account.find({}).sort("UserName", pymongo.ASCENDING)   # ascending
db.Account.find({}).sort("UserName", pymongo.DESCENDING)  # descending
# or, with the numeric direction values
db.Account.find({}).sort("UserName", 1)   # ascending
db.Account.find({}).sort("UserName", -1)  # descending

#### 多列结果排序

In [None]:
# Sort on several keys by passing a list of (field, direction) pairs.
db.Account.find().sort(
    [
        ("UserName", pymongo.ASCENDING),
        ("Email", pymongo.DESCENDING),
    ]
)

#### 嵌套查询

In [None]:
# Count the documents whose embedded "response" document has
# status_code == 302. getCollection() and cursor.count() belong to the mongo
# shell; in pymongo, index the database and use count_documents().
db['urls'].count_documents({'response.status_code': 302})

#### 查询返回部分字段,不返回整个文档

查询返回部分字段,不返回整个文档可以节省流量,降低带宽

* 第一个参数为查询条件，空代表查询所有文档
* 第二个参数中 1 代表选取该字段的值, 0代表过滤该字段的值

如果需要输出的字段比较多，不想要某个字段，可以用排除字段的方法 


In [None]:
# Return only the title and type fields (plus _id) of every document, rather
# than the whole documents.
db.news.find({}, {'title': 1, 'type': 1})

# Inclusion and exclusion cannot be mixed in one projection; the only field
# that may be excluded alongside inclusions is _id.
db.news.find({}, {'title': 1, '_id': 0})

# Filter on type and leave that field out of the result.
db.inventory.find({'type': 'food'}, {'type': 0})


# Return every field except content.
db.news.find({}, {'content': 0})

### map-reduce

In [None]:
from bson.code import Code

# map: for every document emit a {platform, time} value keyed by its tel.
mapfunc = Code('''function(){emit(this.tel, {platform: this.platform,time:this.time})}''')
# reduce: among the values for one key keep the platform/time pair with the
# largest time.
reducefunc = Code(
    '''function(key, values){var time = 0;var platform = "";values.forEach(function(doc){if(doc.time>time){time=doc.time;platform=doc.platform;}});return {platform: platform,time:time};}''')
# finalize: copy the key back into the reduced value as "tel".
finalfunc = Code('''function(key, val){val.tel = key;return val;}''')

# NOTE(review): Collection.map_reduce is not available in newer PyMongo
# releases -- verify against the installed version.
movie.map_reduce(
    mapfunc,
    reducefunc,
    out="collection_name",
    query={"tel": {"$regex": '^[0-9]+$'}, "time": {"$gte": 0}},
    finalize=finalfunc,
)

In [56]:
import random
import pymongo

# Connection settings for the local test database.
MONGODB = {
    "user": "arvin",
    "passwd": "arvin123",
    "host": "127.0.0.1:27017",
    "dbname": "testdb"
}


def connect_mongo(mongo_param):
    """Return a handle to the database named in ``mongo_param``.

    ``mongo_param`` is a mapping with the keys ``user``, ``passwd``, ``host``
    (``host:port``) and ``dbname``. The credentials are percent-encoded before
    being placed into the URI.

    Returns the database handle, or ``None`` (after printing the error) if
    the client could not be created.
    """
    # Imported locally so this cell also runs on its own in a fresh kernel.
    from urllib.parse import quote_plus

    uri = 'mongodb://{}:{}@{}/{}'.format(quote_plus(mongo_param['user']),
                                        quote_plus(mongo_param['passwd']),
                                        mongo_param['host'],
                                        mongo_param['dbname'])
    try:
        # connect=False: the connection is opened on the first operation.
        client = pymongo.MongoClient(uri, connect=False)
        return client[mongo_param['dbname']]
    except Exception as e:
        # Best effort: report the problem and signal failure with None.
        print(e)
        return None
        
import calendar

db = connect_mongo(MONGODB)

# Seed document; the daily score lists are added to it as extra fields.
doc = {'site': 'https://www.xxxxxx.com'}
res = db['score'].insert_one(doc)


try:
    for year in [2018, 2019, 2020]:
        for month in range(1, 13):
            # Use the real month length so that no nonexistent date (such as
            # 2018-2-29) is generated and days 30/31 are not skipped.
            days_in_month = calendar.monthrange(year, month)[1]
            for day in range(1, days_in_month + 1):
                date_str = '{}-{}-{}'.format(year, month, day)
                # Four random scores in the range 30..98.
                data = [random.randrange(30, 99) for _ in range(4)]
                db['score'].update_one({'_id': res.inserted_id}, {'$set': {date_str: data}}, upsert=True)
except Exception as e:
    # Best effort: report the failure and stop generating data.
    print(e)
                

# for i in range(10):
#     name = ['xiaoming', 'frank', 'tom', 'jack', 'bomb']
#     age = [18, 20, 19, 21, 22, 26, 24]
#     weight = [120, 150 , 142, 123, 126, 135, 134]
#     doc = {'name': random.choice(name), 'age':random.choice(age), 'weight': random.choice(weight), 'sex': 'male'}
#     db['info'].insert_one(doc)

# # 插入一条数据会返回一个ObjectId对象,ObjectId('5af8f8c62b5eba0bb8fece6d')    
# result = movie.insert_one(doc)


In [4]:
import pymongo

# Open a client for the remote server. Nothing is authenticated and no
# database is selected yet -- roughly the same as opening a mongo shell.
client = pymongo.MongoClient('192.168.1.141', 27017)
# Get a handle to the target database.
db = client['goshawk_test']
# db.command("createUser", "Goshawk", pwd="yYZmrjJ8xauAe#t7", roles=["readWrite"])

# 创建管理员用户,密码和权限
# db.command("createUser", "admin", pwd="admin@123456.", roles=["root"])
# 创建普通用户,密码和读取权限
# db.command("createUser", "arvin", pwd="arvin@123456.", roles=["readWrite"])

In [10]:
from itertools import zip_longest

# Sample inputs for the parsing functions in this cell: one raw sshd log line
# (bytes, newline-terminated), the token attached to each parsed record, and
# the platform name that selects the line separator.
data = b'Aug 30 14:53:39 ubuntu sshd[7636]: Accepted password for root from 192.168.187.1 port 7436 ssh2\n'
token = 'b2ddbd9212bf6c62cd658fab8c9ddd0a'
platform = 'linux'

def preprocess_data(data, platform):
    """Split raw log bytes into a list of stripped, non-empty text lines.

    Args:
        data: raw log content as ``bytes``.
        platform: ``'windows'`` splits on CRLF, ``'linux'`` splits on LF.
            Any other value yields an empty list.

    Returns:
        A list of ``str``. Lines that are not valid UTF-8 are logged and kept
        in their escaped ``bytes`` repr form (without the ``b'...'`` wrapper).
    """
    # Imported locally so the function does not depend on a module-level
    # import that this cell never performs.
    import logging

    data_list = []
    lines = []
    if platform == 'windows':
        lines = data.split(b'\r\n')
    elif platform == 'linux':
        lines = data.split(b'\n')
    for line in lines:
        if not line:
            continue
        try:
            str_line = line.decode()
        except UnicodeDecodeError:
            logging.error("data: {}, decode error: can't decode with utf-8.".format(line))
            # Fall back to the repr of the bytes, minus the leading b' and
            # the trailing quote.
            str_line = str(line)[2:-1]
        data_list.append(str_line.strip())
    return data_list


def _handle_ssh_detail(line):
    """Parse one sshd authentication log line into a flat dict.

    For a line shaped like
    ``Aug 30 14:53:39 ubuntu sshd[7636]: Accepted password for root from
    192.168.187.1 port 7436 ssh2`` the result has the keys ``time``,
    ``status``, ``user``, ``IP`` and ``port``.

    Raises:
        ValueError: if the message lacks the words ``for``, ``from`` or
            ``port``.
        IndexError: if fewer than two tokens follow ``port``.
    """
    # First 16 characters of the line; for the sample format this is the
    # timestamp plus the space that follows it.
    str_time = line[:16]
    data = {'time': str_time}
    KEYS = ['status', 'user', 'IP', 'port']
    # The message is whatever follows the last ':' in the line.
    content = line[line.rfind(':')+1:].strip()
    # Drop the empty tokens that consecutive spaces produce, so the
    # positional pops below stay aligned.
    details = [item for item in content.split(' ') if item]
    print(details)
    # Remove the filler words: the token before "for", "for" itself, "from",
    # the token two places after "port", and "port" itself.
    details.pop(details.index('for')-1)
    details.pop(details.index('for'))
    details.pop(details.index('from'))
    details.pop(details.index('port')+2)
    details.pop(details.index('port'))
    # Pair the remaining tokens with the keys; whichever side is shorter is
    # padded with '_'.
    for k, v in zip_longest(KEYS, details, fillvalue='_'):
        data[k] = v
    print(details)
    return data


def handle_ssh_data(data, platform, token):
    """Parse the sshd lines found in ``data`` and print the resulting records.

    ``data`` and ``platform`` are passed to ``preprocess_data``. Each parsed
    record gets ``token`` stored under the ``'token'`` key. Nothing is
    returned; the raw input and the final record list are printed.
    """
    print(data)
    # Keep the lines that mention sshd but do not mention 'session'.
    ssh_lines = [
        entry for entry in preprocess_data(data, platform)
        if 'sshd' in entry and 'session' not in entry
    ]

    records = []
    for entry in ssh_lines:
        record = _handle_ssh_detail(entry)
        record['token'] = token
        records.append(record)
    print('ssh data: ', records)


# Run the parser on the sample inputs defined earlier in this cell.
handle_ssh_data(data, platform, token)

b'Aug 30 14:53:39 ubuntu sshd[7636]: Accepted password for root from 192.168.187.1 port 7436 ssh2\n'
['Accepted', 'password', 'for', 'root', 'from', '192.168.187.1', 'port', '7436', 'ssh2']
['Accepted', 'root', '192.168.187.1', '7436']
ssh data:  [{'time': 'Aug 30 14:53:39 ', 'status': 'Accepted', 'user': 'root', 'IP': '192.168.187.1', 'port': '7436', 'token': 'b2ddbd9212bf6c62cd658fab8c9ddd0a'}]


### mongo分页操作

db.collection.find({}).skip(index).limit(num)

### 值大小比较过滤

```
cursor = db['sys'].find({'token': token, 'time': {'$gt': start}}, {'_id': 0, 'average': 1, 'time': 1})
cursor = db['sys'].find({'token': token, 'time': {'$lt': end}}, {'_id': 0, 'average': 1, 'time': 1})
cursor = db['sys'].find({'token': token, 'time': {'$gte': start}}, {'_id': 0, 'average': 1, 'time': 1})
cursor = db['sys'].find({'token': token, 'time': {'$lte': end}}, {'_id': 0, 'average': 1, 'time': 1})
cursor = db['sys'].find({'token': token, 'time': {'$gt': start, '$lt': end}}, {'_id': 0, 'average': 1, 'time': 1})
cursor = db['sys'].find({'token': token, 'time': {'$ne': start}}, {'_id': 0, 'average': 1, 'time': 1})
```

### 根据字段排序

pymongo中排序(默认为升序):
* 升序: pymongo.ASCENDING = 1
* 降序: pymongo.DESCENDING = -1

单个字段排序:

```
db.Account.find().sort("UserName")  # 默认为升序
db.Account.find().sort("UserName",pymongo.ASCENDING)   # 升序
db.Account.find().sort("UserName",pymongo.DESCENDING)  # 降序
```

多字段排序:
```
db.Account.find().sort([("UserName", pymongo.ASCENDING),("Email",pymongo.DESCENDING)])
```

### in 和 not in (`$in $nin`) 判断数据是否存在某个集合

```
db.collection.find({"field":{$in:array}});


db.things.find({j:{$in: [2,4,6]}});
db.things.find({j:{$nin: [2,4,6]}});
```

### 取模运算($mod)

```
db.things.find("this.a%10 == 1")

db.things.find({a:{$mod:[10, 1]}})
```


4) 取模运算$mod

如下面的运算：
db.things.find( "this.a % 10 == 1")
可用$mod代替：

db.things.find( { a : { $mod : [ 10 , 1 ] } } )


5)  $all

$all和$in类似，但是他需要匹配条件内所有的值：

如有一个对象：

{ a: [ 1, 2, 3 ] }
下面这个条件是可以匹配的：

db.things.find( { a: { $all: [ 2, 3 ] } } );
但是下面这个条件就不行了：

db.things.find( { a: { $all: [ 2, 3, 4 ] } } );

6)  $size

$size是匹配数组内的元素数量的，如有一个对象：{a:["foo"]}，他只有一个元素：

下面的语句就可以匹配：db.things.find( { a : { $size: 1 } } );
官网上说不能用来匹配一个范围内的元素，如果想找$size<5之类的，他们建议创建一个字段来保存元素的数量。

You cannot use $size to find a range of sizes (for example: arrays with more than 1 element). If you need to query for a range, create an extra size field that you increment when you add elements.

7）$exists

$exists用来判断一个元素是否存在：

如：

db.things.find( { a : { $exists : true } } ); // 如果存在元素a,就返回
db.things.find( { a : { $exists : false } } ); // 如果不存在元素a，就返回

8)  $type

$type 基于 bson type来匹配一个元素的类型，像是按照类型ID来匹配，不过我没找到bson类型和id对照表。

db.things.find( { a : { $type : 2 } } ); // matches if a is a string
db.things.find( { a : { $type : 16 } } ); // matches if a is an int
9）正则表达式

mongo支持正则表达式，如：

db.customers.find( { name : /acme.*corp/i } ); // 后面的i的意思是不区分大小写

10)  查询数据内的值

下面的查询是查询colors内red的记录，如果colors元素是一个数据,数据库将遍历这个数组的元素来查询。db.things.find( { colors : "red" } );

11) $elemMatch

如果对象有一个元素是数组，那么$elemMatch可以匹配内数组内的元素：

> t.find( { x : { $elemMatch : { a : 1, b : { $gt : 1 } } } } ) 
{ "_id" : ObjectId("4b5783300334000000000aa9"), 
"x" : [ { "a" : 1, "b" : 3 }, 7, { "b" : 99 }, { "a" : 11 } ]
}

$elemMatch : { a : 1, b : { $gt : 1 } } 所有的条件都要匹配上才行。
注意，上面的语句和下面是不一样的。

> t.find( { "x.a" : 1, "x.b" : { $gt : 1 } } )
$elemMatch是匹配{ "a" : 1, "b" : 3 }，而后面一句是匹配{ "b" : 99 }, { "a" : 11 } 


12)  查询嵌入对象的值

db.postings.find( { "author.name" : "joe" } );
注意用法是author.name，用一个点就行了。更详细的可以看这个链接： dot notation

举个例子：

> db.blog.save({ title : "My First Post", author: {name : "Jane", id : 1}})
如果我们要查询 authors name 是Jane的, 我们可以这样：

> db.blog.findOne({"author.name" : "Jane"})
如果不用点，那就需要用下面这句才能匹配：

db.blog.findOne({"author" : {"name" : "Jane", "id" : 1}})
下面这句：

db.blog.findOne({"author" : {"name" : "Jane"}})
是不能匹配的，因为mongodb对于子对象，他是精确匹配。

13) 元操作符 $not 取反

如：

db.customers.find( { name : { $not : /acme.*corp/i } } );
db.things.find( { a : { $not : { $mod : [ 10 , 1 ] } } } );

mongodb还有很多函数可以用，如排序，统计等，请参考原文。

早期版本的mongodb没有或(or)操作符，只能用变通的办法代替，可以参考下面的链接；较新的版本已经提供 `$or` 操作符，例如：db.things.find( { $or : [ { a : 1 }, { b : 2 } ] } )

http://www.mongodb.org/display/DOCS/OR+operations+in+query+expressions